tasks/multimedia-archive: add support for category RSS attribute

This commit is contained in:
Gabriel Arazas 2022-11-21 10:38:15 +08:00
parent ddb39df858
commit 09abb36ad6

View File

@ -28,13 +28,17 @@
# for the structure for these various applications so I'm taking the simplest # for the structure for these various applications so I'm taking the simplest
# way. # way.
# #
# For now, this script extracts categories based on the structure of the
# outline and the `category` attribute of each RSS node.
#
# Welp, the disadvantage of OPML being a very flexible format it seems. :( # Welp, the disadvantage of OPML being a very flexible format it seems. :(
import argparse import argparse
import copy
import json import json
import re import re
import sys import sys
from typing import Dict from typing import Dict, Set
from xml.etree import ElementTree from xml.etree import ElementTree
FALLBACK_CATEGORY = "Uncategorized" FALLBACK_CATEGORY = "Uncategorized"
@ -63,6 +67,7 @@ class Subscription(object):
self.name = name self.name = name
self.url = url self.url = url
self.description = description self.description = description
self.categories = []
class Outline(object): class Outline(object):
@ -156,6 +161,22 @@ class Outline(object):
if description is not None: if description is not None:
subscription.description = description subscription.description = description
# Only get the first category hierarchy from the attribute.
# Similar behavior to how categories are extracted from the
# outline elements. Consistency!
for category in outline.get("category", "").strip().split(","):
# Strip the entry and skip it if nothing (i.e., an empty string) is left.
category = category.strip()
if not category:
continue
category_hierarchy = filter(lambda split: split.strip(), category.split("/"))
first_category_split = first(None, category_hierarchy)
if first_category_split is None:
continue
subscription.categories.append(first_category_split)
inner_outline.add_subscription(subscription) inner_outline.add_subscription(subscription)
root_outline.add_child( root_outline.add_child(
@ -173,12 +194,29 @@ def list_categories_from_outline(root_outline: Outline):
for child in root_outline.children: for child in root_outline.children:
title = FALLBACK_CATEGORY if child.title is None else child.title title = FALLBACK_CATEGORY if child.title is None else child.title
data.add(title) data.add(title)
return list(data)
def recurse(root_outline: Outline, data: Set = None):
    """Collect every subscription category found in an outline tree.

    Walks `root_outline.subscriptions`, then descends into
    `root_outline.children`, adding each entry of every subscription's
    `categories` list to `data`.

    Args:
        root_outline: The outline to start from.
        data: Set to accumulate into. When omitted, a new set is created
            for this call; a `set()` default would be built only once and
            then shared by every call relying on it.

    Returns:
        The set holding the collected categories; this is the same object
        as `data` when one was passed in.
    """
    # `is None` rather than truthiness: an explicitly passed empty set must
    # still be the one that gets filled.
    if data is None:
        data = set()

    for subscription in root_outline.subscriptions:
        data.update(subscription.categories)

    for child in root_outline.children:
        recurse(child, data)

    return data
data = recurse(root_outline, data)
return sorted(data)
def create_jobs_from_outline(root_outline: Outline, categories=[]): def create_jobs_from_outline(root_outline: Outline, categories=[]):
data = {} data = {}
DATA_TEMPLATE = {
"extraArgs": [],
"subscriptions": [],
}
def recurse(outline: Outline, category=None, data={}, depth=1): def recurse(outline: Outline, category=None, data={}, depth=1):
# We're only using the top-level outline titles as the category. # We're only using the top-level outline titles as the category.
if depth == 2: if depth == 2:
@ -188,14 +226,24 @@ def create_jobs_from_outline(root_outline: Outline, categories=[]):
if depth == 1 or category is None: if depth == 1 or category is None:
category = FALLBACK_CATEGORY category = FALLBACK_CATEGORY
if category not in data: data.setdefault(category, copy.deepcopy(DATA_TEMPLATE))
data[category] = {
"extraArgs": [],
"subscriptions": [],
}
for subscription in outline.subscriptions: for subscription in outline.subscriptions:
data[category]["subscriptions"].append(subscription) # There are some things that are meant not to be shown (i.e.,
# `categories`) so we're putting it in a data template.
subscription_data = {
"name": subscription.name,
"url": subscription.url
}
if subscription.description:
subscription_data["description"] = subscription.description
data[category]["subscriptions"].append(subscription_data)
for sub_category in subscription.categories:
data.setdefault(sub_category, copy.deepcopy(DATA_TEMPLATE))
data[sub_category]["subscriptions"].append(subscription_data)
for child in outline.children: for child in outline.children:
recurse(child, category, data, depth + 1) recurse(child, category, data, depth + 1)
@ -277,7 +325,7 @@ if __name__ == "__main__":
"sort_keys": True, "sort_keys": True,
} }
if "output" in args: if args.output:
with open(args.output, mode="w") as output_file: with open(args.output, mode="w") as output_file:
json.dump(data, output_file, **json_dump_kwargs) json.dump(data, output_file, **json_dump_kwargs)
else: else: