From 09abb36ad6a12a70cb3c767742cbd760803f38fd Mon Sep 17 00:00:00 2001
From: Gabriel Arazas <foo.dogsquared@gmail.com>
Date: Mon, 21 Nov 2022 10:38:15 +0800
Subject: [PATCH] tasks/multimedia-archive: add support for `category` RSS
 attribute

---
 .../scripts/create-jobs-from-rss-opml.py      | 66 ++++++++++++++++---
 1 file changed, 57 insertions(+), 9 deletions(-)

diff --git a/modules/nixos/tasks/multimedia-archive/scripts/create-jobs-from-rss-opml.py b/modules/nixos/tasks/multimedia-archive/scripts/create-jobs-from-rss-opml.py
index d0a32e39..2cb82bba 100755
--- a/modules/nixos/tasks/multimedia-archive/scripts/create-jobs-from-rss-opml.py
+++ b/modules/nixos/tasks/multimedia-archive/scripts/create-jobs-from-rss-opml.py
@@ -28,13 +28,17 @@
 # for the structure for these various applications so I'm taking the simplest
 # way.
 #
+# For now, this script extracts categories based on the structure of the
+# outline and the `category` attribute of each RSS node.
+#
 # Welp, the disadvantage of OPML being a very flexible format it seems. :(
 
 import argparse
+import copy
 import json
 import re
 import sys
-from typing import Dict
+from typing import Dict, Set
 from xml.etree import ElementTree
 
 FALLBACK_CATEGORY = "Uncategorized"
@@ -63,6 +67,7 @@ class Subscription(object):
         self.name = name
         self.url = url
         self.description = description
+        self.categories = []
 
 
 class Outline(object):
@@ -156,6 +161,22 @@ class Outline(object):
                     if description is not None:
                         subscription.description = description
 
+                    # Only take the top-level part of each category hierarchy in the
+                    # attribute, mirroring how categories are extracted from the
+                    # outline elements. Consistency!
+                    for category in outline.get("category", "").strip().split(","):
+                        # Skip entries that are empty once whitespace is stripped.
+                        category = category.strip()
+                        if not category:
+                            continue
+
+                        category_hierarchy = filter(lambda split: split.strip(), category.split("/"))
+                        first_category_split = first(None, category_hierarchy)
+                        if first_category_split is None:
+                            continue
+
+                        subscription.categories.append(first_category_split)
+
                     inner_outline.add_subscription(subscription)
 
                 root_outline.add_child(
@@ -173,12 +194,29 @@ def list_categories_from_outline(root_outline: Outline):
     for child in root_outline.children:
         title = FALLBACK_CATEGORY if child.title is None else child.title
         data.add(title)
-    return list(data)
+
+    def recurse(root_outline: Outline, data: Set = set()):
+        for subscription in root_outline.subscriptions:
+            for category in subscription.categories:
+                data.add(category)
+
+        for child in root_outline.children:
+            recurse(child, data)
+
+        return data
+
+    data = recurse(root_outline, data)
+    return sorted(data)
 
 
 def create_jobs_from_outline(root_outline: Outline, categories=[]):
     data = {}
 
+    DATA_TEMPLATE = {
+        "extraArgs": [],
+        "subscriptions": [],
+    }
+
     def recurse(outline: Outline, category=None, data={}, depth=1):
         # We're only using the top-level outline titles as the category.
         if depth == 2:
@@ -188,14 +226,24 @@ def create_jobs_from_outline(root_outline: Outline, categories=[]):
         if depth == 1 or category is None:
             category = FALLBACK_CATEGORY
 
-        if category not in data:
-            data[category] = {
-                "extraArgs": [],
-                "subscriptions": [],
-            }
+        data.setdefault(category, copy.deepcopy(DATA_TEMPLATE))
 
         for subscription in outline.subscriptions:
-            data[category]["subscriptions"].append(subscription)
+            # Some attributes (e.g., `categories`) are not meant to be shown in
+            # the output, so only the selected fields are copied into this dict.
+            subscription_data = {
+                "name": subscription.name,
+                "url": subscription.url
+            }
+
+            if subscription.description:
+                subscription_data["description"] = subscription.description
+
+            data[category]["subscriptions"].append(subscription_data)
+
+            for sub_category in subscription.categories:
+                data.setdefault(sub_category, copy.deepcopy(DATA_TEMPLATE))
+                data[sub_category]["subscriptions"].append(subscription_data)
 
         for child in outline.children:
             recurse(child, category, data, depth + 1)
@@ -277,7 +325,7 @@ if __name__ == "__main__":
                 "sort_keys": True,
             }
 
-            if "output" in args:
+            if args.output:
                 with open(args.output, mode="w") as output_file:
                     json.dump(data, output_file, **json_dump_kwargs)
             else: