nixos-config/modules/nixos/tasks/multimedia-archive/scripts/create-jobs-from-rss-opml.py

#!/usr/bin/env nix-shell
#! nix-shell -i python3 -p python3

# NOTE: If you're using Thunderbird and you're using a folder structure in your
# feeds folder where one folder == one feed. It's going to be messy for you.

# It creates a jobs database suitable for this task from a given OPML file
# typically exported through Thunderbird and newsboat. This script considers
# the outline hierarchy as part of tag hierarchy similar to Newsboat import
# script. Additionally, `categories` attribute in the element are also
# considered.
#
# Take note, this script considers the first outline with a 'title' or 'text'
# attribute containing other RSS nodes as the category name and that's it.
#
# Anywho, the following document URL at <http://opml.org/spec2.opml> is used as
# the basis for how OPML subscription lists work.
#
# This script **tries** to consider the common way how most applications export
# their OPML which is not great. The only applications I've ever inspected are
# OPMLs from Thunderbird, Feeder, and FreshFeed. Each with their own quirks and
# usage of several attributes compared to what is expected from the
# specification.
#
# For example, most applications I've used don't easily export categories into
# the 'category' attribute which is unfortunate. There seems to be little
# respect for the attribute. Not to mention, there could be many assumptions
# for the structure for these various applications so I'm taking the simplest
# way.
#
# For now, this scripts extracts categories based from the structure of the
# outline and the `category` attribute of each RSS node.
#
# Welp, the disadvantage of OPML being a very flexible format it seems. :(

import argparse
import copy
import json
import re
import sys
from typing import Dict, Set
from xml.etree import ElementTree

FALLBACK_CATEGORY = "Uncategorized"


# Very lazy implementation of kebab-casing. :)
def kebab_case(string):
    string = string.lower()
    string = re.sub(r"\s+", "-", string)
    string = re.sub("[^a-zA-Z0-9-]", "", string)
    string = re.sub("-+", "-", string)
    string = re.sub("^-|-$", "", string)
    return string


def first(function, iterable, default=None):
    """
    Returns the first value that passed the predicate function. Otherwise,
    return with the given default value.
    """
    return next(filter(function, iterable), default)


class Subscription(object):
    def __init__(self, name, url, description=""):
        self.name = name
        self.url = url
        self.description = description
        self.categories = []


class Outline(object):
    """An ``<outline>`` OPML element representation in Python."""

    def __init__(self, title=None, subscriptions=None, children=None):
        self.title = title
        self.subscriptions = []
        self.children = []

        if children is not None:
            for child in children:
                self.add_child(child)

        if subscriptions is not None:
            for subscription in subscriptions:
                self.add_subscription(subscription)

    def export(self) -> Dict:
        """Export the outline hierarchy as a dictionary."""

        SUBSCRIPTION_KEY = "__subscriptions__"
        CHILDREN_KEY = "__children__"

        def recurse(root: Outline, data={}, depth: int = 1):
            title = "root" if depth == 1 else root.title

            if title is None:
                title = FALLBACK_CATEGORY

            data[title] = {SUBSCRIPTION_KEY: [], CHILDREN_KEY: {}}

            for subscription in root.subscriptions:
                data[title][SUBSCRIPTION_KEY].append(subscription)

            for child in root.children:
                recurse(child, data[title][CHILDREN_KEY], depth + 1)

            return data

        return recurse(self, {})

    def add_child(self, child):
        assert isinstance(child, Outline)
        self.children.append(child)

    def add_subscription(self, subscription):
        assert isinstance(subscription, Subscription)
        self.subscriptions.append(subscription)

    @staticmethod
    def parse(opml_xml: ElementTree.ElementTree, max_depth: int | None = None):
        """
        Parse a given OPML as an ``ElementTree`` and return an ``Outline``
        instance out of it.
        """
        opml_body = opml_xml.find("./body")

        assert (
            opml_body is not None
        ), """
            Given OPML does not have a ``<body>`` element. It is most likely
            the OPML is not valid.
        """

        root_outline = Outline()

        def get_attributes(element: ElementTree.Element, attributes=[]):
            return first(
                lambda elem: elem is not None,
                map(lambda attr: element.get(attr, None), attributes),
            )

        def recurse(
            root_outline: Outline,
            element: ElementTree.Element,
            depth: int = 1,
            max_depth: int | None = None,
        ) -> Outline:
            outlines = element.iterfind("./outline")

            for outline in outlines:
                title = get_attributes(outline, ["title", "text"])
                inner_outline = Outline(title)

                node_type = outline.get("type")
                if node_type == "rss":
                    subscription = Subscription(title, outline.get("xmlUrl"))

                    description = outline.get("description")
                    if description is not None:
                        subscription.description = description

                    # Only get the first category hierarchy from the attribute.
                    # Similar behavior to how categories are extracted from the
                    # outline elements. Consistency!
                    for category in outline.get("category", "").strip().split(","):
                        # If empty string or whatever falsey value this will have.
                        category = category.strip()
                        if not category:
                            continue

                        category_hierarchy = filter(
                            lambda split: split.strip(), category.split("/")
                        )
                        first_category_split = first(None, category_hierarchy)
                        if first_category_split is None:
                            continue

                        subscription.categories.append(first_category_split)

                    inner_outline.add_subscription(subscription)

                root_outline.add_child(
                    recurse(inner_outline, outline, depth + 1, max_depth)
                )

            return root_outline

        return recurse(root_outline, opml_body, max_depth=max_depth)


def list_categories_from_outline(root_outline: Outline):
    data = set()

    for child in root_outline.children:
        title = FALLBACK_CATEGORY if child.title is None else child.title
        data.add(title)

    def recurse(root_outline: Outline, data: Set = set()):
        for subscription in root_outline.subscriptions:
            for category in subscription.categories:
                data.add(category)

        for child in root_outline.children:
            recurse(child, data)

        return data

    data = recurse(root_outline, data)
    return sorted(data)


def create_jobs_from_outline(root_outline: Outline, categories=[]):
    data = {}

    DATA_TEMPLATE = {
        "extraArgs": [],
        "subscriptions": [],
    }

    def recurse(outline: Outline, category=None, data={}, depth=1):
        # We're only using the top-level outline titles as the category.
        if depth == 2:
            category = outline.title

        # The root outline shouldn't have a title.
        if depth == 1 or category is None:
            category = FALLBACK_CATEGORY

        data.setdefault(category, copy.deepcopy(DATA_TEMPLATE))

        for subscription in outline.subscriptions:
            # There are some things that are meant not to be shown (i.e.,
            # `categories`) so we're putting it in a data template.
            subscription_data = {"name": subscription.name, "url": subscription.url}

            if subscription.description:
                subscription_data["description"] = subscription.description

            data[category]["subscriptions"].append(subscription_data)

            for sub_category in subscription.categories:
                data.setdefault(sub_category, copy.deepcopy(DATA_TEMPLATE))
                data[sub_category]["subscriptions"].append(subscription_data)

        for child in outline.children:
            recurse(child, category, data, depth + 1)

        return data

    data = recurse(root_outline, data=data)

    keys = list(data.keys())
    for category in keys:
        if category not in categories:
            del data[category]

    return data


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Create a job database from an OPML file."
    )
    parser.add_argument("file", metavar="OPML_FILE", help="The OPML file.")
    parser.add_argument(
        "categories",
        nargs="*",
        metavar="CATEGORY",
        help="A list of categories to be extracted. If no categories are given, assumes that all categories are to be extracted.",
    )
    parser.add_argument(
        "--list",
        "-l",
        action="store_true",
        help="List all categories from the given file.",
    )
    parser.add_argument(
        "--output",
        "-o",
        action="store",
        metavar="FILE",
        help="The file where the output will be written.",
    )
    parser.add_argument(
        "--with-others",
        action="store_true",
        help=f"List all uncategorized feeds into '{FALLBACK_CATEGORY}'.",
    )

    args = parser.parse_args()

    with open(args.file, mode="r") as f:
        opml_xml = ElementTree.parse(f)
        root_outline = Outline.parse(opml_xml)
        if args.list:
            for category in list_categories_from_outline(root_outline):
                print(category)
        else:
            categories = []

            # We're setting it up this way to prioritize arguments from stdin.
            if not sys.stdin.isatty():
                for line in sys.stdin:
                    categories.append(line.strip())

            if len(args.categories) > 0:
                categories = args.categories
            elif len(categories) == 0:
                categories = list_categories_from_outline(root_outline)

            data = create_jobs_from_outline(root_outline, categories)

            # Ehhh... Personal preference.
            keys = list(data.keys())
            for key in keys:
                data[kebab_case(key)] = data.pop(key)

            json_dump_kwargs = {
                "default": vars,
                "ensure_ascii": False,
                "indent": 2,
                "sort_keys": True,
            }

            if args.output:
                with open(args.output, mode="w") as output_file:
                    json.dump(data, output_file, **json_dump_kwargs)
            else:
                print(json.dumps(data, **json_dump_kwargs))

# vi:ft=python:ts=4