#!/usr/bin/python3
# mkrss
"""Make RSS for file list.

It requires rss.py by Fuktommy.

Synopsis:
    mkrss.py [/path/to/html/dir] > rss.xml
    mkrss.py -b /path/to/html/dir file_list > rss.xml
    find /path/to/html/dir -type f | \
        mkrss.py -b /path/to/html/dir > rss.xml

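Options:
    -b base_dir: Use a given file list instead of searching a directory;
                 base_dir is the prefix stripped from each listed path.
    -h header_file: File that supplies the title and other settings
                    for the RSS.

It generates an RSS item for a file if file.txt exists or the file is HTML.
For an HTML file without file.txt, it uses the <title> tag and the
<meta name="description"...> tag.
The timestamp is taken from the Git log, falling back to the file's
modification time.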

Header file:
    version: 1.0/2.0        # version of RSS
    parent: URI             # URI of parent of documents
    uri: URI                # URI of RSS
    link: URI               # URI of main page
    title: string           # title of RSS
    description: string     # description of RSS
    xsl: path               # PATH or URI of XSL

Text file:
    Top line is the title.
    Following line is description.
"""

#
# Copyright (c) 2005-2019 Satoshi Fukutomi <info@fuktommy.com>.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.
#

import fileinput
import os
import subprocess
import sys
import re
import time
from html.parser import HTMLParser

import rss      # module written by Fuktommy

#
# Configuration
#

# Defaults for the generated feed.  read_header() can override the
# RSS-related ones (version, parent, uri, link, title, description, xsl).
rss_version = "1.0"     # "1.0" or "2.0"; selects which RSS serializer is used
encode = "utf-8"        # passed to rss.RSS; also HtmlReader's initial encoding
parent_uri = "http://example.com/files/"    # URI of parent of documents
link = parent_uri       # URI of main page
rss_uri = parent_uri + "rss.xml"    # URI of RSS
title = "Files"         # title of RSS
description = "My Files"    # description of RSS
xsl = ""                # PATH or URI of XSL
header_file = ""        # NOTE(review): not referenced by the code shown here

# Switch the process time zone to UTC.
os.environ['TZ'] = 'UTC'
time.tzset()

def read_header(f):
    """Read header file and override the module-level RSS settings.

    Recognized keys:

    version: 1.0/2.0        # version of RSS
    parent: URI             # URI of parent of documents
    uri: URI                # URI of RSS
    link: URI               # URI of main page
    title: string           # title of RSS
    description: string     # description of RSS
    xsl: path               # PATH or URI of XSL

    Blank lines and lines starting with "#" are skipped.  A line without
    a colon continues the value of the preceding key: it is appended
    after its leading whitespace and line ending are removed.  Such a
    line is ignored when no key has been read yet.  Unknown keys are
    ignored.

    f is the path of the header file; nothing is returned.
    """

    re_iscomment = re.compile(r"^\s*#")
    re_keyval = re.compile(r"\s*:\s*")
    # Header key -> name of the module-level setting it overrides.
    settings = {
        "version": "rss_version",
        "parent": "parent_uri",
        "uri": "rss_uri",
        "link": "link",
        "title": "title",
        "description": "description",
        "xsl": "xsl",
    }

    conf = {}
    last = None     # most recently seen key, target of continuation lines
    with open(f) as header:
        for line in header:
            if re_iscomment.search(line):
                continue
            line = line.lstrip().rstrip("\r\n")
            if not line:
                continue
            parts = re_keyval.split(line, maxsplit=1)
            if len(parts) == 2:
                last, value = parts
                conf[last] = value
            elif last is not None:
                conf[last] += line
            # A continuation line before any key has nothing to extend.

    module_vars = globals()
    for key, name in settings.items():
        if key in conf:
            module_vars[name] = conf[key]

def read_text(f):
    """Read title and description from the text file next to f.

    The file read is f + ".txt".
    Top line is the title (its line ending is kept).
    Everything after it is the description.

    Returns the tuple (title, description).
    """

    # The context manager closes the file even if reading fails.
    with open(f + ".txt") as txt:
        title = txt.readline()
        desc = txt.read()
    return (title, desc)

class HtmlReader(HTMLParser):
    """Read title and description from HTML.

    Use <title> tag and <meta name="description"...> tag.
    The character encoding named by an XML declaration or by
    <meta http-equiv="content-type"...> is recorded in ``encode``.
    """

    # Class-level defaults; __init__ gives every instance its own values.
    attrs = {}
    now = ""
    encode = ""
    re_xmlencode = re.compile(r"xml.*encoding=.([^<>\"']+)..*\?$")
    re_htmlencode = re.compile(r"charset=([^<>\"'/]+)")
    del_space = re.compile(r"^\s*|\s*$")

    def __init__(self):
        """Constructor."""

        HTMLParser.__init__(self)
        self.attrs = {"title": "", "description": ""}
        self.now = ""           # key of self.attrs that receives text data
        self.encode = encode    # start from the module-level default

    def dic_attr(self, attrs):
        """Convert attrs list to dictionary.

        A later duplicate of an attribute name replaces the earlier one.
        """

        return dict(attrs)

    def handle_starttag(self, tag, attrs):
        """Overwrite method.

        <title> starts title collection; the two recognized <meta> forms
        are recorded; any other start tag stops title collection.
        """

        attr = self.dic_attr(attrs)
        if tag == "title":
            self.now = "title"
            return

        # A valueless attribute is reported as None; such a <meta> carries
        # no usable content and is treated like any other tag.
        content = attr.get("content")
        if tag == "meta" and content is not None:
            if attr.get("name") == "description":
                self.attrs["description"] = content
                return
            http_equiv = attr.get("http-equiv")
            if http_equiv is not None \
                and http_equiv.lower() == "content-type":
                # Content-Type may omit charset; keep the current encoding.
                htmlencode = self.re_htmlencode.search(content)
                if htmlencode:
                    self.encode = htmlencode.group(1)
                return

        self.now = ""

    def handle_endtag(self, tag):
        """Overwrite method.

        Stop collecting at </title> so that text following the title
        element is not appended to the title.
        """
        if tag == "title":
            self.now = ""

    def handle_data(self, data):
        """Overwrite method."""
        if self.now:
            self.attrs[self.now] += data

    def handle_pi(self, data):
        """Overwrite method."""
        xmlencode = self.re_xmlencode.search(data)
        if xmlencode:
            self.encode = xmlencode.group(1)

    def getData(self):
        """Return title and description.

        Leading and trailing whitespace is removed from both values.
        """

        for k in ("title", "description"):
            self.attrs[k] = self.del_space.sub("", self.attrs[k])
        return (self.attrs["title"], self.attrs["description"])

def read_html(f):
    """Read title and description from an HTML file.

    Wrapper of HtmlReader.  f is the path of the HTML file.
    Returns the tuple (title, description).
    """
    parser = HtmlReader()
    # The context manager closes the file even if parsing fails.
    with open(f) as html:
        parser.feed(html.read())
    # close() flushes text the parser still buffers, so it must run
    # before the collected data is read.
    parser.close()
    return parser.getData()

def findfiles(dir):
    """Search directories for files.

    It works like ``find dir -type f'', except that the result is in
    breadth-first order: files of a shallower level come before files
    of a deeper level.  Symbolic links are skipped, both links to files
    and links to directories.

    Returns a list of paths, each built as parent + "/" + name starting
    from dir.
    """

    files = []
    level = [dir + "/" + name for name in os.listdir(dir)]

    # Walk one directory level at a time; this visits entries in the same
    # order as a FIFO queue without the cost of popping from a list front.
    while level:
        deeper = []
        for path in level:
            if os.path.islink(path):
                continue
            if os.path.isdir(path):
                deeper.extend(path + "/" + name for name in os.listdir(path))
            elif os.path.isfile(path):
                files.append(path)
        level = deeper

    return files

def get_date(filename):
    """Return the timestamp of a file as seconds since the Unix epoch.

    Asks Git for the author time (%at) of the last commit touching
    filename.  When Git cannot be run or prints nothing usable (for
    example the file is not tracked), the file's modification time is
    returned instead.

    Returns an int taken from Git, or the float from os.path.getmtime.
    Raises OSError if the fallback cannot stat filename.
    """
    command = ['git', 'log', '-n', '1', '--pretty=format:%at', '--', filename]
    try:
        # run() waits for the process and closes the pipe for us.
        output = subprocess.run(command, stdout=subprocess.PIPE).stdout
        return int(output)
    except (OSError, ValueError):
        # OSError: git is not installed or not executable.
        # ValueError: empty or non-numeric output.
        return os.path.getmtime(filename)

#
# main
#

#
# make list of files
#

# Results of this section, used by the RSS generation below:
#   dir   - base directory whose prefix is stripped from item paths
#   files - paths of the candidate files
dir = ""
files = []
lists = []            # positional (non-option) arguments
mode = "self_find"    # self_find / given_list

# Work on a copy of the arguments so that sys.argv stays intact.
pending = iter(sys.argv[1:])
for arg in pending:
    if arg in ("-b", "-h"):
        operand = next(pending, None)
        if operand is None:
            sys.exit("mkrss: option %s requires an argument" % arg)
        if arg == "-b":
            mode = "given_list"
            dir = operand
        else:
            read_header(operand)
    else:
        lists.append(arg)

if mode == "given_list":
    # Read file names from the list files named on the command line;
    # fileinput falls back to stdin when none were given.
    for line in fileinput.input(lists):
        name = line.rstrip("\r\n")
        if name:
            files.append(name)
elif len(lists) == 1:
    dir = lists[0]
    files = findfiles(dir)
elif not lists:
    dir = "."
    files = findfiles(dir)
# With several directories and no -b, no files are collected.

#
# Generate RSS
#

feed = rss.RSS(encode=encode, title=title,
        parent=parent_uri, uri=rss_uri, link=link,
        description=description, xsl=xsl)

ishtml = re.compile(r"\.html$")
# Only a file literally named index.html stands for its directory;
# names such as "reindex.html" must keep their stem.
del_index = re.compile(r"(^|/)index\.html$")
# Base-directory prefix to strip, tolerant of a trailing slash on dir.
prefix = dir.rstrip("/") + "/"

for path in files:
    if os.path.isfile(path + ".txt"):
        # A companion text file takes precedence and keeps the full name.
        item_title, item_desc = read_text(path)
        target = path
    elif ishtml.search(path):
        item_title, item_desc = read_html(path)
        # Link to "dir/" for index pages and drop ".html" otherwise.
        target = ishtml.sub("", del_index.sub(r"\1", path))
    else:
        continue
    if not item_title:
        continue

    if target.startswith(prefix):
        target = target[len(prefix):]
    else:
        # Path does not start with the base directory: keep the
        # historical fixed-length cut.
        target = target[len(dir)+1:]
    # The timestamp is looked up only for files that become items.
    feed.append(target, title=item_title, date=get_date(path),
                description=item_desc)

if rss_version == "1.0":
    sys.stdout.write(rss.make_rss1(feed))
elif rss_version == "2.0":
    sys.stdout.write(rss.make_rss2(feed))
else:
    sys.exit("mkrss: unsupported RSS version: %r" % rss_version)