mirror of
https://github.com/foo-dogsquared/nixos-config.git
synced 2025-01-31 22:57:55 +00:00
334 lines
11 KiB
Python
Executable File
334 lines
11 KiB
Python
Executable File
#!/usr/bin/env nix-shell
|
|
#! nix-shell -i python3 -p python3
|
|
|
|
# NOTE: If you're using Thunderbird and you're using a folder structure in your
|
|
# feeds folder where one folder == one feed, it's going to be messy for you.
|
|
|
|
# It creates a jobs database suitable for this task from a given OPML file
|
|
# typically exported through Thunderbird and newsboat. This script considers
|
|
# the outline hierarchy as part of tag hierarchy similar to Newsboat import
|
|
# script. Additionally, `categories` attribute in the element are also
|
|
# considered.
|
|
#
|
|
# Take note, this script considers the first outline with a 'title' or 'text'
|
|
# attribute containing other RSS nodes as the category name and that's it.
|
|
#
|
|
# Anywho, the following document URL at <http://opml.org/spec2.opml> is used as
|
|
# the basis for how OPML subscription lists work.
|
|
#
|
|
# This script **tries** to consider the common way how most applications export
|
|
# their OPML which is not great. The only applications I've ever inspected are
|
|
# OPMLs from Thunderbird, Feeder, and FreshFeed. Each with their own quirks and
|
|
# usage of several attributes compared to what is expected from the
|
|
# specification.
|
|
#
|
|
# For example, most applications I've used don't easily export categories into
|
|
# the 'category' attribute which is unfortunate. There seems to be little
|
|
# respect for the attribute. Not to mention, there could be many assumptions
|
|
# for the structure for these various applications so I'm taking the simplest
|
|
# way.
|
|
#
|
|
# For now, this script extracts categories based on the structure of the
|
|
# outline and the `category` attribute of each RSS node.
|
|
#
|
|
# Welp, the disadvantage of OPML being a very flexible format it seems. :(
|
|
|
|
import argparse
|
|
import copy
|
|
import json
|
|
import re
|
|
import sys
|
|
from typing import Dict, Set
|
|
from xml.etree import ElementTree
|
|
|
|
FALLBACK_CATEGORY = "Uncategorized"
|
|
|
|
|
|
# Very lazy implementation of kebab-casing. :)
|
|
def kebab_case(string):
    """
    Return a (lazily) kebab-cased version of the given string.

    The string is lowercased, then rewritten in four passes: whitespace runs
    become a hyphen, anything that is not an ASCII letter, digit, or hyphen is
    dropped, repeated hyphens are collapsed into one, and a leading and/or
    trailing hyphen is trimmed.
    """
    # Applied strictly in this order; each pass works on the previous result.
    substitutions = (
        (r"\s+", "-"),
        ("[^a-zA-Z0-9-]", ""),
        ("-+", "-"),
        ("^-|-$", ""),
    )

    result = string.lower()
    for pattern, replacement in substitutions:
        result = re.sub(pattern, replacement, result)

    return result
|
|
|
|
|
|
def first(function, iterable, default=None):
    """
    Return the first item of the iterable accepted by the predicate function.

    A predicate of ``None`` accepts any truthy item. If nothing is accepted,
    the given default value is returned instead. The iterable is consumed only
    up to (and including) the returned item.
    """
    predicate = bool if function is None else function

    for item in iterable:
        if predicate(item):
            return item

    return default
|
|
|
|
|
|
class Subscription:
    """A feed subscription: a name, a URL, a description, and categories."""

    def __init__(self, name, url, description=""):
        self.name, self.url = name, url
        self.description = description

        # Always starts empty and is never shared between instances; callers
        # append category names to it after construction.
        self.categories = []
|
|
|
|
|
|
class Outline(object):
|
|
"""An ``<outline>`` OPML element representation in Python."""
|
|
|
|
def __init__(self, title=None, subscriptions=None, children=None):
|
|
self.title = title
|
|
self.subscriptions = []
|
|
self.children = []
|
|
|
|
if children is not None:
|
|
for child in children:
|
|
self.add_child(child)
|
|
|
|
if subscriptions is not None:
|
|
for subscription in subscriptions:
|
|
self.add_subscription(subscription)
|
|
|
|
def export(self) -> Dict:
|
|
"""Export the outline hierarchy as a dictionary."""
|
|
|
|
SUBSCRIPTION_KEY = "__subscriptions__"
|
|
CHILDREN_KEY = "__children__"
|
|
|
|
def recurse(root: Outline, data={}, depth: int = 1):
|
|
title = "root" if depth == 1 else root.title
|
|
|
|
if title is None:
|
|
title = FALLBACK_CATEGORY
|
|
|
|
data[title] = {SUBSCRIPTION_KEY: [], CHILDREN_KEY: {}}
|
|
|
|
for subscription in root.subscriptions:
|
|
data[title][SUBSCRIPTION_KEY].append(subscription)
|
|
|
|
for child in root.children:
|
|
recurse(child, data[title][CHILDREN_KEY], depth + 1)
|
|
|
|
return data
|
|
|
|
return recurse(self, {})
|
|
|
|
def add_child(self, child):
|
|
assert isinstance(child, Outline)
|
|
self.children.append(child)
|
|
|
|
def add_subscription(self, subscription):
|
|
assert isinstance(subscription, Subscription)
|
|
self.subscriptions.append(subscription)
|
|
|
|
@staticmethod
|
|
def parse(opml_xml: ElementTree.ElementTree, max_depth: int | None = None):
|
|
"""
|
|
Parse a given OPML as an ``ElementTree`` and return an ``Outline``
|
|
instance out of it.
|
|
"""
|
|
opml_body = opml_xml.find("./body")
|
|
|
|
assert (
|
|
opml_body is not None
|
|
), """
|
|
Given OPML does not have a ``<body>`` element. It is most likely
|
|
the OPML is not valid.
|
|
"""
|
|
|
|
root_outline = Outline()
|
|
|
|
def get_attributes(element: ElementTree.Element, attributes=[]):
|
|
return first(
|
|
lambda elem: elem is not None,
|
|
map(lambda attr: element.get(attr, None), attributes),
|
|
)
|
|
|
|
def recurse(
|
|
root_outline: Outline,
|
|
element: ElementTree.Element,
|
|
depth: int = 1,
|
|
max_depth: int | None = None,
|
|
) -> Outline:
|
|
outlines = element.iterfind("./outline")
|
|
|
|
for outline in outlines:
|
|
title = get_attributes(outline, ["title", "text"])
|
|
inner_outline = Outline(title)
|
|
|
|
node_type = outline.get("type")
|
|
if node_type == "rss":
|
|
subscription = Subscription(title, outline.get("xmlUrl"))
|
|
|
|
description = outline.get("description")
|
|
if description is not None:
|
|
subscription.description = description
|
|
|
|
# Only get the first category hierarchy from the attribute.
|
|
# Similar behavior to how categories are extracted from the
|
|
# outline elements. Consistency!
|
|
for category in outline.get("category", "").strip().split(","):
|
|
# If empty string or whatever falsey value this will have.
|
|
category = category.strip()
|
|
if not category:
|
|
continue
|
|
|
|
category_hierarchy = filter(
|
|
lambda split: split.strip(), category.split("/")
|
|
)
|
|
first_category_split = first(None, category_hierarchy)
|
|
if first_category_split is None:
|
|
continue
|
|
|
|
subscription.categories.append(first_category_split)
|
|
|
|
inner_outline.add_subscription(subscription)
|
|
|
|
root_outline.add_child(
|
|
recurse(inner_outline, outline, depth + 1, max_depth)
|
|
)
|
|
|
|
return root_outline
|
|
|
|
return recurse(root_outline, opml_body, max_depth=max_depth)
|
|
|
|
|
|
def list_categories_from_outline(root_outline: Outline):
    """
    Return a sorted list of every category name found in the outline.

    A category is either the title of a direct child of the given outline
    (``FALLBACK_CATEGORY`` for untitled children) or any entry in the
    ``categories`` list of a subscription anywhere in the hierarchy.
    """
    found = {
        FALLBACK_CATEGORY if child.title is None else child.title
        for child in root_outline.children
    }

    # Walk the whole tree with an explicit stack; visiting order does not
    # matter since the names land in a set that is sorted at the end.
    pending = [root_outline]
    while pending:
        outline = pending.pop()

        for subscription in outline.subscriptions:
            found.update(subscription.categories)

        pending.extend(outline.children)

    return sorted(found)
|
|
|
|
|
|
def create_jobs_from_outline(root_outline: "Outline", categories=()):
    """
    Build the jobs dictionary out of the given outline hierarchy.

    Each job is keyed by category name and holds an ``extraArgs`` list and a
    ``subscriptions`` list of ``{"name", "url"[, "description"]}``
    dictionaries. A subscription is filed under the title of the top-level
    outline it lives in (``FALLBACK_CATEGORY`` if that title is missing or the
    subscription sits directly on the root), plus every name in its own
    ``categories`` list. It is filed at most once per category, even when the
    same name shows up from both sources or is repeated.

    Only the categories named in ``categories`` are returned; with the default
    empty value the result is an empty dictionary.
    """
    wanted = frozenset(categories)
    data = {}

    def new_job():
        # A fresh template each time so jobs never share their lists.
        return {"extraArgs": [], "subscriptions": []}

    def recurse(outline, category=None, depth=1):
        # We're only using the top-level outline titles as the category.
        if depth == 2:
            category = outline.title

        # The root outline shouldn't have a title.
        if depth == 1 or category is None:
            category = FALLBACK_CATEGORY

        # Registered even when no subscription ends up here, so a requested
        # category without feeds still appears in the output.
        data.setdefault(category, new_job())

        for subscription in outline.subscriptions:
            # There are some things that are meant not to be shown (i.e.,
            # `categories`) so we're putting it in a data template.
            subscription_data = {"name": subscription.name, "url": subscription.url}

            if subscription.description:
                subscription_data["description"] = subscription.description

            # dict.fromkeys de-duplicates while keeping the original order:
            # the structural category first, then the attribute categories.
            targets = dict.fromkeys([category, *subscription.categories])
            for target in targets:
                # Each job gets its own copy so later edits to one entry do
                # not leak into the other categories.
                data.setdefault(target, new_job())["subscriptions"].append(
                    dict(subscription_data)
                )

        for child in outline.children:
            recurse(child, category, depth + 1)

    recurse(root_outline)

    return {name: job for name, job in data.items() if name in wanted}
|
|
|
|
|
|
def _merge_jobs_by_kebab_case(jobs):
    """
    Re-key the jobs by the kebab-cased form of their category names.

    Categories whose names collapse to the same key have their lists
    concatenated instead of one silently replacing the other.
    """
    merged = {}

    for name, job in jobs.items():
        target = merged.setdefault(kebab_case(name), {})
        for field, values in job.items():
            target.setdefault(field, []).extend(values)

    return merged


def main():
    """Parse the command line and print or write the requested output."""
    parser = argparse.ArgumentParser(
        description="Create a job database from an OPML file."
    )
    parser.add_argument("file", metavar="OPML_FILE", help="The OPML file.")
    parser.add_argument(
        "categories",
        nargs="*",
        metavar="CATEGORY",
        help="A list of categories to be extracted. If no categories are given, assumes that all categories are to be extracted.",
    )
    parser.add_argument(
        "--list",
        "-l",
        action="store_true",
        help="List all categories from the given file.",
    )
    parser.add_argument(
        "--output",
        "-o",
        action="store",
        metavar="FILE",
        help="The file where the output will be written.",
    )
    # NOTE(review): this flag is accepted but nothing below reads
    # `args.with_others` yet.
    parser.add_argument(
        "--with-others",
        action="store_true",
        help=f"List all uncategorized feeds into '{FALLBACK_CATEGORY}'.",
    )

    args = parser.parse_args()

    # Hand the path to the parser so it reads the raw bytes itself and honors
    # the encoding declared by the XML document, not the locale's default.
    opml_xml = ElementTree.parse(args.file)
    root_outline = Outline.parse(opml_xml)

    if args.list:
        for category in list_categories_from_outline(root_outline):
            print(category)
        return

    # Categories may be piped through stdin (one per line), but the ones given
    # on the command line win. With neither, every category is extracted.
    categories = []
    if not sys.stdin.isatty():
        # Blank lines would otherwise turn into empty category names.
        categories = [line.strip() for line in sys.stdin if line.strip()]

    if args.categories:
        categories = args.categories
    elif not categories:
        categories = list_categories_from_outline(root_outline)

    data = create_jobs_from_outline(root_outline, categories)

    # Ehhh... Personal preference.
    data = _merge_jobs_by_kebab_case(data)

    json_dump_kwargs = {
        "default": vars,
        "ensure_ascii": False,
        "indent": 2,
        "sort_keys": True,
    }

    if args.output:
        # Non-ASCII characters are written as-is (ensure_ascii=False), so the
        # file encoding must not depend on the locale.
        with open(args.output, mode="w", encoding="utf-8") as output_file:
            json.dump(data, output_file, **json_dump_kwargs)
    else:
        print(json.dumps(data, **json_dump_kwargs))


if __name__ == "__main__":
    main()
|
|
|
|
# vi:ft=python:ts=4
|