#!/usr/bin/python
"""Generate HTML index pages and Atom feeds for a directory tree of pages.

Every ``*html`` file found in the per-year input directories is scanned for
its title and its meta tags (readfiles); the collected properties are then
filtered, grouped, sorted and written out through format strings
(writefile).  Running the file as a script writes the indices under
``indices/``; importing it only defines the helpers.

The code runs unchanged on Python 2.7 and Python 3.

NOTE(review): in the copy of this file that was reviewed, the HTML/XML markup
inside the template strings (page header/footer, headings, item lines, Atom
feed/entry templates) appears to have been stripped.  Only the strings the
code cannot work without were restored (the two parsing regexes and the
two-placeholder footer of the ``.inc`` files); every other template is
reproduced as found and must be checked against the original before this
script is used to publish anything.
"""

import functools
import glob
from re import (findall, sub, DOTALL, IGNORECASE)
import operator
import time

# A list rather than a generator: a generator would be exhausted by the first
# readfiles() call and any later call would silently find no files.
INPUTDIRS = ["%s" % i for i in range(2000, 2020)]
#URLPATH="http://www.consistent.org/terran"
URLPATH = "/terran"
ABSOLUTEURLPREFIX = "http://www.consistent.org"

# Properties of every page, keyed by filename.  Rebound by readfiles() and
# read by cmpdate() and writefile().  The name shadows the builtin of the
# same name; it is kept because it is this module's established global.
property = {}

# name/content pairs of meta tags, and the page title.
_METAPATTERN = r'''<meta[^>]*name="(.*?)"[^>]*content="(.*?)"[^>]*>'''
_TITLEPATTERN = r'''<title>([^<]*)</title>'''
# Dates are stored as YYYYMMDD.
_DATEPATTERN = r'(\d\d\d\d)(\d\d)(\d\d)'


def _cmp(a, b):
    """Three-way comparison; stands in for the Python 2 builtin cmp()."""
    return (a > b) - (a < b)


def _newestfirst(a, b):
    """cmp-style function that reverses the usual order (largest first)."""
    return _cmp(b, a)


def _hasdate(entry):
    """Return True when the entry's date is a nonzero integer string.

    A blank or malformed date counts as "no date" instead of raising, so
    that pages with date typos end up in the not-indexed listing.
    """
    try:
        return int(entry.get('date') or 0) != 0
    except (TypeError, ValueError):
        return False


def _incategory(category):
    """Return a selector that accepts entries of exactly this category."""
    def select(entry):
        return entry['category'] == category
    return select


def htmlheaders(title):
    """Return a (header, footer) 2-tuple for an HTML index page.

    The title is interpolated twice into the header: first with anything
    that looks like a tag removed, then verbatim.
    """
    header = """\
%s

%s

""" % (sub('<.*?>', '', title), title)
    footer = """

"""
    return (header, footer)


def atomheaders(**kwargs):
    """Return a (header, footer) 2-tuple for an Atom feed.

    Required keyword arguments are:
        title
        url
        author
    The same url is used both as the self link and as the ID.  Extra
    keyword arguments are accepted; a missing required one raises KeyError.
    The feed timestamp is the current time in UTC.
    """
    kwargs['time3339'] = "%04d-%02d-%02dT%02d:%02d:%02dZ" % time.gmtime()[:6]
    header = """\
%(title)s
%(time3339)s
%(author)s
%(url)s
""" % kwargs
    return (header, "")


def parenifnotblank(string):
    """Wrap string in parentheses; None and "" are returned unchanged."""
    if string is None or len(string) == 0:
        return string
    return "(%s)" % string


# Reference: http://www.atomenabled.org/developers/syndication/

# This is common to all feeds we write:
atomformatstring = """\
%(title)s
%(absoluteurl)s
%(date3339)s

%(summary)s

"""

# NOTE(review): as found, this is textually identical to atomformatstring;
# whatever extra link it carried is not visible in the reviewed copy.
atomextralinkformatstring = """\
%(title)s
%(absoluteurl)s
%(date3339)s

%(summary)s

"""

##########################################################################


def _parsepage(filename, data, defaults):
    """Build the property dict for one page from its HTML text.

    Starts from a copy of defaults, then adds the title, every meta tag
    (names lowercased), 'filename', 'url', 'absoluteurl', 'year', and
    'date3339' when the date is usable for a feed.
    """
    entry = defaults.copy()

    foundtitle = findall(_TITLEPATTERN, data, DOTALL | IGNORECASE)
    entry["title"] = foundtitle[0] if foundtitle else "Untitled"

    # Meta tags come after the title so that an explicit "title" meta tag
    # overrides the page title.
    for name, content in findall(_METAPATTERN, data, DOTALL | IGNORECASE):
        entry[name.lower()] = content

    # This is convenient later, when we wish to iterate over the
    # values without passing the filename keys separately:
    entry['filename'] = filename
    if not entry.get('url'):
        # Missing or blank url: fall back to the file's own path.
        entry['url'] = filename

    # Atom requires an annoying time format.  Times are always zero,
    # because only dates are stored.  Entries without a usable date simply
    # get no 'date3339' key; the feeds select on its presence.
    date = entry.get('date') or ''
    datematch = findall(_DATEPATTERN, date)
    if datematch:
        year, month, day = datematch[0]
        # Atom validator complains about missing month or day:
        if int(month) > 0 and int(day) > 0:
            entry['date3339'] = \
                "%04s-%02s-%02sT00:00:00Z" % (year, month, day)

    # Add prefix to urls without a leading slash or leading http
    url = entry['url']
    if not url.startswith('/') and url[0:4].lower() != 'http':
        url = URLPATH + '/' + url
    entry['url'] = url
    if url[0:4].lower() != 'http':
        entry['absoluteurl'] = ABSOLUTEURLPREFIX + url
    else:
        entry['absoluteurl'] = url

    # Add a separate year.  A blank date still gets a (blank) year so that
    # grouping on 'year' never hits a missing key.
    if date:
        entry['year'] = date[0:4]
    else:
        entry.setdefault('year', '')

    # Remove newlines in summary for LJ
    try:
        entry['summary'] = sub('\n', ' ', entry['summary'])
    except (KeyError, TypeError):
        # No summary, or not text: leave it alone.
        pass

    return entry


def readfiles(defaultdict):
    """Scan every ``*html`` file under INPUTDIRS and collect its properties.

    defaultdict supplies the default value of each property; it is copied
    for every file and never modified.  The result is stored in the global
    ``property`` (filename -> dict of properties) and also returned.
    """
    global property
    property = {}
    files = []
    for directory in INPUTDIRS:
        files.extend(glob.glob(directory + "/*html"))
    for filename in files:
        with open(filename, "r") as handle:
            data = handle.read()
        property[filename] = _parsepage(filename, data, defaultdict)
    return property


def cmpdate(a, b):
    """cmp-style comparison of two filenames by their pages' dates."""
    return _cmp(property[a].get("date", ""), property[b].get("date", ""))


def writefile(filename, headfoot, maxcount, selectfunc, *varargs):
    """Write one index file from the pages collected by readfiles().

    Our arguments are:
        filename to write

        A 2-tuple containing strings with the header and footer.
        A single None is accepted in place of ('', '') if you want nothing.

        An integer representing the maximum number of entries, or a False
        value for no limit

        a function to select records, or None for all records
        (None is a special case to save you writing a bunch of lambdas
        that just return true)

        0 or more 2-tuples.  The first is the named parameter to group on,
        and the second is a format string for printing out the group
        heading when the value changes.  For example, you might provide
        two, one on the "category" field and the next on "subcategory".
        If the format string is None, sorting by this parameter will be
        done, but no headings will be printed (e.g. to sort by date, but
        not print a heading for each different day).  If a 3-tuple is
        given instead, the last item is a cmp-style sort function taking
        two values and returning a negative, zero or positive number -
        desirable, for example, to put most recent dates first by
        reversing the usual order.

        Exactly one format string for individual items, as above.

    The format string of a group gets one argument, the new value of the
    field in question, which should be substituted into the single %s which
    should appear in the format string.

    The final format string for the items gets passed a dict of all the
    item's properties, which you use with %(name)s style substitutions.

    The select function is passed one argument, a dict of the properties of
    the file in question.  These are all whatever appears in the meta tags
    plus the following special cases:
        title
        filename
        url (constructed from filename and base if not given explicitly)

    This is complicated, but it is general.  For a more intuitive
    understanding of what it does, look at the examples.

    Returns the number of items written.  Raises TypeError if the item
    format string is missing.
    """
    if not varargs:
        # Fail before the output file is truncated.
        raise TypeError("writefile() needs at least the item format string")

    # Filter by our function
    if selectfunc:
        selected = [entry for entry in property.values() if selectfunc(entry)]
    else:
        selected = list(property.values())

    # Enclosed in a list so the nested functions can update it (there is
    # no "nonlocal" in Python 2).
    total = [0]

    with open(filename, "w") as out:

        def emit(text):
            out.write("%s\n" % text)

        def recursecategory(items, levels):
            """Write items grouped by levels; True once maxcount is hit."""
            level = levels[0]

            # In the last case, iterate and print the format string.
            if len(levels) == 1:
                for item in items:
                    emit(level % item)
                    total[0] += 1
                    if maxcount and total[0] >= maxcount:
                        return True
                return False

            field, heading = level[0], level[1]
            sortfunc = level[2] if len(level) > 2 and level[2] else _cmp

            # One pass to bucket the items; order inside a bucket is kept.
            groups = {}
            for item in items:
                groups.setdefault(item[field], []).append(item)

            for val in sorted(groups, key=functools.cmp_to_key(sortfunc)):
                if heading:
                    emit(heading % val)
                if recursecategory(groups[val], levels[1:]):
                    return True
            return False

        if headfoot:
            emit(headfoot[0])
        recursecategory(selected, varargs)
        if headfoot:
            emit(headfoot[1])

    return total[0]


def main():
    """Read every page and write all index pages and feeds to indices/."""
    readfiles({'summary': '', 'category': '', 'subcategory': '', 'date': '00000000'})

    for category in ("Technical", "Personal"):
        writefile("indices/%s.shtml" % category.lower(),
            htmlheaders('Terran Melconian: %s' % category),
            None,
            _incategory(category),
            ('subcategory', "\n\n%s\n\n"),
            ('year', "\n\n%s\n\n", _newestfirst),
            ('date', None, _newestfirst),
            '\n\n%(title)s (%(date)s): %(summary)s\n\n')

        # NOTE(review): the footer is formatted with two arguments, so it
        # needs two placeholders; a link to the full index written just
        # above is assumed - confirm the href against the original.
        writefile("indices/%s.inc" % category.lower(),
            ('', '<a href="%s.shtml">Index of Older %s</a>'
                 % (category.lower(), category)),
            5,              # limit 5 items
            _incategory(category),
            ('category', '\n\n%s\n\n'),
            ('date', None, _newestfirst),
            '\n\n%(title)s (%(date)s): %(summary)s\n\n')

    writefile("indices/date.shtml",
        htmlheaders('Terran Melconian: Index by Date'),
        None,               # No limit count
        _hasdate,           # Must have a nonzero date
        ('year', "\n\n%s\n\n", _newestfirst),
        ('date', None, _newestfirst),
        '\n\n%(title)s (%(date)s/%(category)s): %(summary)s\n\n')

    writefile("indices/title.shtml",
        htmlheaders('Terran Melconian: Index by Title'),
        None,               # No limit count
        lambda y: y['category'],    # must have a category
        ('title', None),
        '\n\n%(title)s (%(date)s/%(category)s): %(summary)s\n\n')

    writefile("indices/category.shtml",
        htmlheaders('Terran Melconian: Index by Category'),
        None,               # No limit count
        lambda y: y['category'],    # must have a category
        ('category', "\n\n%s\n\n"),
        ('subcategory', "\n\n%s\n\n"),
        ('date', None, _newestfirst),
        '\n\n%(title)s (%(date)s): %(summary)s\n\n')

    writefile("indices/notindexed.shtml",
        htmlheaders('For Internal Use Only: Unindexed Pages (Used to Find Typos)'),
        None,               # No limit count
        lambda y: not y['category'] or not _hasdate(y),  # must NOT have info
        ('date', None, _newestfirst),
        '\n\n%(title)s (%(date)s): %(summary)s\n\n')

    # NOTE(review): all feeds below pass the same url (atom.xml), which
    # atomheaders uses as both self link and feed ID - confirm that the
    # facebook and per-category feeds are meant to share it.
    writefile("indices/atom.xml",
        atomheaders(url="http://www.consistent.org/terran/indices/atom.xml",
            author="Terran Melconian",
            title="Terran Melconian: Live on the Interblarg!",
            alturl="http://www.consistent.org/terran/"),
        5,                  # Limit count 5
        lambda y: 'date3339' in y,  # Must have acceptable timestamp
        ('date', None, _newestfirst),   # Most recent first
        atomformatstring)

    writefile("indices/atom-facebook.xml",
        atomheaders(url="http://www.consistent.org/terran/indices/atom.xml",
            author="Terran Melconian",
            title="Terran Melconian: Live on the Interblarg!",
            alturl="http://www.consistent.org/terran/"),
        5,                  # Limit count 5
        lambda y: 'date3339' in y,  # Must have acceptable timestamp
        ('date', None, _newestfirst),   # Most recent first
        atomextralinkformatstring)

    for category in ("Technical", "Personal"):
        writefile("indices/atom_%s.xml" % category.lower(),
            atomheaders(url="http://www.consistent.org/terran/indices/atom.xml",
                author="Terran Melconian",
                title="Terran Melconian: category %s" % category,
                alturl="http://www.consistent.org/terran/"),
            5,              # Limit count 5
            lambda y, category=category:
                'date3339' in y and y['category'] == category,
            ('date', None, _newestfirst),   # Most recent first
            atomformatstring)


if __name__ == "__main__":
    main()

# vi: set nowrap: