#!/usr/bin/python

# Directories scanned for pages: one per year, "2000" through "2019".
# This is a tuple rather than a generator expression so that it can be
# iterated any number of times; a generator would be empty on the second pass.
INPUTDIRS=tuple("%s"%year for year in range(2000,2020))
# Alternative: give URLPATH as a full URL instead of a site-relative path.
#URLPATH="http://www.consistent.org/terran"
URLPATH="/terran"
ABSOLUTEURLPREFIX="http://www.consistent.org"

import glob
from re import (findall, sub, DOTALL, IGNORECASE)
import operator
import time

def htmlheaders(title):
    """Build the header and footer text for one HTML index page.

    title may contain markup (for example a link).  It is used verbatim in
    the <h1> heading; for the <title> element everything that looks like
    a tag is stripped out first.

    Returns a 2-tuple of strings: (header, footer).
    """
    # Non-greedy, so each tag is removed on its own and the text between
    # tags survives.
    plaintitle = sub('<.*?>', '', title)

    # The DOCTYPE line ends in a space before its newline; it is written
    # with an explicit escape so that the space is visible in the source.
    doctype = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN"> \n'

    header = doctype + """\
<html>
<head>
<title>%s</title>
</head>
<body
bgcolor="#000000"
text="#80c0ff"
link="#40f0c0"
vlink="#00b080"
alink="#f0f040"
>
<h1 align=center>%s</h1>
""" % (plaintitle, title)

    # The footer closes the page after an #include directive for
    # /mfooter.shtml.
    footer = """
<hr>
<!--#include virtual="/mfooter.shtml" -->
</body>
</html>
    """

    return (header, footer)

def atomheaders(**kwargs):
    """Build the opening and closing text of an Atom feed document.

    Keyword arguments substituted into the feed header (all required;
    a missing one raises KeyError):

        title    text of the feed <title>
        url      used both as the rel="self" link and as the feed <id>
        alturl   target of the rel="alternate" link
        author   name placed inside <author><name>

    The <updated> element is always the current UTC time; a time3339
    keyword supplied by the caller is overwritten.

    Returns a 2-tuple of strings: (header, "</feed>").
    """
    fields = dict(kwargs)
    # Timestamp in the YYYY-MM-DDTHH:MM:SSZ form, taken from gmtime().
    fields['time3339'] = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())

    header = """\
<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
    <link rel="self" href="%(url)s" />
    <link rel="alternate" href="%(alturl)s" />

    <title>%(title)s</title>
    <updated>%(time3339)s</updated>
    <author><name>%(author)s</name></author>
    <id>%(url)s</id>
    """ % fields

    return (header, "</feed>")


def parenifnotblank(string):
    """Wrap string in parentheses unless it is None or empty.

    A None or empty argument is returned unchanged, so callers can
    interpolate the result without producing a stray "()".
    """
    if not string:
        return string
    return "(%s)" % string

# Reference: http://www.atomenabled.org/developers/syndication/
def _atomentryformat(summarymarkup):
    """Return the format string for one Atom <entry>.

    summarymarkup is the text placed between <summary> and </summary>.
    The result is meant to be %-formatted with a dict that provides
    title, absoluteurl, date3339 and summary.
    """
    return "\n".join((
        '    <entry>',
        '        <title>%(title)s</title>',
        '        <link rel="alternate" href="%(absoluteurl)s" />',
        '        <id>%(absoluteurl)s</id>',
        '        <updated>%(date3339)s</updated>',
        '        <summary>' + summarymarkup + '</summary>',
        '    </entry>',
    ))

# Entry layout with the summary text placed directly inside <summary>:
atomformatstring = _atomentryformat('%(summary)s')

# Same entry, but with the summary text wrapped in a link to the page:
atomextralinkformatstring = _atomentryformat(
    '<a href="%(absoluteurl)s">%(summary)s</a>')

##########################################################################

def readfiles(defaultdict):
    """Scan the input directories and rebuild the global page table.

    Every file matching "*html" directly inside a directory listed in
    INPUTDIRS is read.  For each one a dict of properties is stored in the
    module-level dict ``property`` under the file's name; whatever
    ``property`` held before is discarded.

    defaultdict is a dict of default property values; each page starts
    from a copy of it.  On top of the defaults a page gets:

        title        text of the first <title> element, or "Untitled"
        <name>       one entry per <meta name=... content=...> tag, with
                     the name lowercased
        filename     the path the page was read from
        url          the "url" meta value if non-empty, else the filename;
                     URLPATH is prepended unless it starts with "/" or
                     "http"
        absoluteurl  url, with ABSOLUTEURLPREFIX prepended unless url
                     already starts with "http"
        date3339     only when the date holds YYYYMMDD with a nonzero
                     month and day
        year         first four characters of the date, when the date is
                     non-empty

    Newlines inside the summary are replaced by spaces.
    """
    global property
    property = {}

    filenames = []
    for dirname in INPUTDIRS:
        filenames.extend(glob.glob(dirname + "/*html"))

    for filename in filenames:
        # The context manager closes the file even if reading fails.
        with open(filename, "r") as page:
            data = page.read()

        metatags = findall('''<meta[^>]*name="(.*?)"[^>]*content="(.*?)"[^>]*>''', data, DOTALL | IGNORECASE)
        titles = findall('''<title>([^<]*)</title>''', data, DOTALL | IGNORECASE)

        props = defaultdict.copy()
        if titles:
            props["title"] = titles[0]
        else:
            props["title"] = "Untitled"

        # Meta tags are applied after the title, so a meta tag named
        # "title" takes precedence over the <title> element.
        for name, content in metatags:
            props[name.lower()] = content

        # This is convenient later, when we wish to iterate over the
        # values without passing the filename keys separately:
        props['filename'] = filename

        # An empty url is treated like a missing one; indexing into it
        # below would otherwise fail.
        url = props.get('url') or filename

        # Atom requires an annoying time format.  Times are always zero,
        # because I store only dates.
        date = props.get('date') or ''
        datematches = findall(r'(\d\d\d\d)(\d\d)(\d\d)', date)
        if datematches:
            (year, month, day) = datematches[0]
            # Atom validator complains about missing month or day.  Pages
            # without date3339 are skipped by the feeds that require it.
            if int(month) > 0 and int(day) > 0:
                props['date3339'] = "%s-%s-%sT00:00:00Z" % (year, month, day)

        # Add prefix to urls without a leading slash or leading http
        if not url.startswith('/') and url[0:4].lower() != 'http':
            url = URLPATH + '/' + url
        props['url'] = url

        # Checked again after prefixing, because URLPATH itself may be a
        # full http URL.
        if url[0:4].lower() != 'http':
            props['absoluteurl'] = ABSOLUTEURLPREFIX + url
        else:
            props['absoluteurl'] = url

        # Add a separate year
        if date:
            props['year'] = date[0:4]

        # Remove newlines in summary for LJ
        try:
            props['summary'] = sub('\n', ' ', props['summary'])
        except (KeyError, TypeError):
            # No summary at all, or a non-string default: leave it alone.
            pass

        property[filename] = props

def cmpdate(a,b):
    """cmp-style comparison of two filenames by their pages' dates.

    a and b are keys of the global ``property`` table.  A page without a
    "date" property compares as if its date were the empty string.
    """
    datea = property[a].get("date", "")
    dateb = property[b].get("date", "")
    return cmp(datea, dateb)

def writefile(filename, headfoot, maxcount, selectfunc, *varargs):
    """ Write one index file from the global ``property`` table.

    Our arguments are, in order:

        filename to write

        headfoot: a 2-tuple containing strings with the header and footer.
            A single None (or other false value) is accepted if you want
            neither line written.

        maxcount: an integer representing the maximum number of entries,
            or a False value for no limit.  Group headings do not count
            toward the limit.

        selectfunc: a function to select records, or None for all records
            (None is a special case to save you writing a bunch of lambdas
            that just return true).  It is passed one argument, the dict
            of properties of a page, and the page is included when the
            result is true.

        0 or more 2-tuples.  The first item is the named parameter to
            group on, and the second is a format string for printing out
            the group heading when the value changes; it gets the new
            value substituted into a single %s.  For example, you might
            provide two, one on the "category" field and the next on
            "subcategory".  If the format string is None, sorting by this
            parameter will be done, but no headings will be printed (e.g.
            to sort by date, but not print a heading for each different
            day).  If a 3-tuple is given instead, the last item is a
            cmp-style sort function taking two values and returning a
            negative, zero or positive number - desirable, for example,
            to put most recent dates first by reversing the usual order.

        Exactly one format string for individual items, last.  It is
            %-formatted with the dict of all the item's properties, which
            you use with %(name)s style substitutions.

    The properties are whatever readfiles stored for each page.

    Raises IndexError if the final item format string is omitted, and
    KeyError if a page lacks a property that is grouped on or formatted.

    This is complicated, but it is general.  For a more
    intuitive understanding of what it does, look at the examples.
    """
    # Imported here so that the module's import block is left untouched.
    from functools import cmp_to_key

    def lines(items, levels):
        """Yield (isentry, text) pairs for items, grouped as levels says."""
        level = levels[0]

        # In the last case, level is the format string for single items.
        if len(levels) == 1:
            for item in items:
                yield True, level % item
            return

        field, heading = level[0], level[1]

        # One pass to bucket the items; their relative order is kept.
        groups = {}
        for item in items:
            groups.setdefault(item[field], []).append(item)

        if len(level) > 2 and level[2]:
            ordered = sorted(groups, key=cmp_to_key(level[2]))
        else:
            ordered = sorted(groups)

        for value in ordered:
            if heading:
                yield False, heading % value
            for line in lines(groups[value], levels[1:]):
                yield line

    # Filter by our function
    if selectfunc:
        records = [page for page in property.values() if selectfunc(page)]
    else:
        records = list(property.values())

    # The context manager closes the file even if formatting fails.
    with open(filename, "w") as out:
        if headfoot:
            out.write("%s\n" % (headfoot[0],))

        written = 0
        for isentry, text in lines(records, varargs):
            out.write("%s\n" % (text,))
            if isentry:
                written += 1
                # Stop right after the entry that reaches the limit, so
                # that no further headings or entries are written.
                if maxcount and written >= maxcount:
                    break

        if headfoot:
            out.write("%s\n" % (headfoot[1],))

def _newestfirst(x, y):
    """cmp-style comparison that sorts larger (more recent) values first.

    Written without the cmp() builtin, which exists only in Python 2.
    """
    return (y > x) - (y < x)


def _hasrealdate(page):
    """Return True if the page's date is numeric and nonzero.

    A malformed date counts as "no date" instead of raising, so that a
    typo in a page's date puts the page on the notindexed list rather
    than aborting the whole run.
    """
    try:
        return int(page['date']) != 0
    except (KeyError, TypeError, ValueError):
        return False


def _writeatomfeed(basename, title, entryformat, selectfunc):
    """Write indices/<basename> holding the 5 most recent selected pages.

    The feed's self link and <id> are derived from basename, so every
    feed file identifies itself rather than another feed.
    """
    writefile("indices/" + basename,
        atomheaders(url="http://www.consistent.org/terran/indices/" + basename,
            author="Terran Melconian", title=title,
            alturl="http://www.consistent.org/terran/"),
        5, # Limit count 5
        selectfunc,
        ('date', None, _newestfirst), # Most recent first
        entryformat
        )


# Item layouts shared by the HTML indices below.
_ITEMFORMAT = '<dl><dt><a href="%(url)s"><b>%(title)s</b> (%(date)s)</a><dd>%(summary)s</dl>'
_ITEMWITHCATEGORYFORMAT = '<dl><dt><a href="%(url)s"><b>%(title)s</b> (%(date)s/%(category)s)</a><dd>%(summary)s</dl>'
_PAGETITLEPREFIX = '<a href="/terran/">Terran Melconian</a>: '

readfiles({'summary':'', 'category':'', 'subcategory':'', 'date':'00000000'})

for category in ("Technical", "Personal"):
    # Full per-category index, grouped by subcategory and then by year.
    writefile("indices/%s.shtml" % category.lower(),
        htmlheaders(_PAGETITLEPREFIX + category),
        None, # No limit count
        lambda y, wanted=category: y['category'] == wanted,
        ('subcategory', "<h2>%s</h2>"),
        ('year', "<h3>%s</h3>", _newestfirst),
        ('date', None, _newestfirst),
        _ITEMFORMAT)

    # Short per-category fragment that links to the full index.
    writefile("indices/%s.inc" % category.lower(),
        ('', '<a href="/terran/indices/%s.shtml"><em>Index of Older %s</em></a>' % (category.lower(), category)),
        5, # limit 5 items
        lambda y, wanted=category: y['category'] == wanted,
        ('category', '<h2 align="center">%s</h2>'),
        ('date', None, _newestfirst),
        _ITEMFORMAT)

writefile("indices/date.shtml",
        htmlheaders(_PAGETITLEPREFIX + 'Index by Date'),
        None, # No limit count
        _hasrealdate, # Must have a nonzero date
        ('year', "<h2>%s</h2>", _newestfirst),
        ('date', None, _newestfirst),
        _ITEMWITHCATEGORYFORMAT)

writefile("indices/title.shtml",
        htmlheaders(_PAGETITLEPREFIX + 'Index by Title'),
        None, # No limit count
        lambda y: y['category'], # must have a category
        ('title', None),
        _ITEMWITHCATEGORYFORMAT)

writefile("indices/category.shtml",
        htmlheaders(_PAGETITLEPREFIX + 'Index by Category'),
        None, # No limit count
        lambda y: y['category'], # must have a category
        ('category', "<h2>%s</h2>"),
        ('subcategory', "<h3>%s</h3>"),
        ('date', None, _newestfirst),
        _ITEMFORMAT)

writefile("indices/notindexed.shtml",
        htmlheaders('For Internal Use Only: Unindexed Pages (Used to Find Typos)'),
        None, # No limit count
        lambda y: not y['category'] or not _hasrealdate(y), # must NOT have info
        ('date', None, _newestfirst),
        _ITEMFORMAT)

_writeatomfeed("atom.xml",
        "Terran Melconian: Live on the Interblarg!",
        atomformatstring,
        lambda y: 'date3339' in y) # Must have acceptable timestamp

_writeatomfeed("atom-facebook.xml",
        "Terran Melconian: Live on the Interblarg!",
        atomextralinkformatstring,
        lambda y: 'date3339' in y) # Must have acceptable timestamp

for category in ("Technical", "Personal"):
    _writeatomfeed("atom_%s.xml" % category.lower(),
        "Terran Melconian: category %s" % category,
        atomformatstring,
        lambda y, wanted=category: 'date3339' in y and y['category'] == wanted)

# vi: set nowrap:
