#!/usr/bin/python
# Directories scanned for input pages, one per year from 2000 through 2019.
# This is a tuple rather than a generator expression so that it can be
# iterated any number of times (a generator would be empty after the first
# pass, making a second readfiles() call find nothing).
INPUTDIRS = tuple("%s" % i for i in range(2000, 2020))
#URLPATH="http://www.consistent.org/terran"
# Prefix added to page urls that have neither a leading slash nor a leading
# "http".
URLPATH = "/terran"
# Prepended to site-relative urls to form the absolute url of a page.
ABSOLUTEURLPREFIX = "http://www.consistent.org"
import glob
from re import (findall, sub, DOTALL, IGNORECASE)
import operator
import time
def htmlheaders(title):
    """Build the (header, footer) string pair for an HTML index page.

    The header template receives two values: the title with everything
    matching ``<.*?>`` removed, followed by the title unchanged.

    Returns a 2-tuple of strings, in the shape writefile accepts as its
    ``headfoot`` argument.
    """
    plain_title = sub('<.*?>', '', title)
    header = """\
%s
%s
""" % (plain_title, title)
    footer = """
"""
    return (header, footer)
def atomheaders(**kwargs):
    """Build the (header, footer) string pair for an Atom feed.

    Required keyword arguments are:
        title
        url
        author
    The same url is used both as the self link and as the ID.

    A ``time3339`` value holding the current UTC time is added to the
    keyword arguments before the header template is filled in, replacing
    any value the caller supplied under that name.  The footer is the
    empty string.  A missing required keyword raises KeyError.
    """
    now = time.gmtime()
    stamp = "%04d-%02d-%02dT%02d:%02d:%02dZ" % (
        now.tm_year, now.tm_mon, now.tm_mday,
        now.tm_hour, now.tm_min, now.tm_sec)
    kwargs['time3339'] = stamp
    header = """\
%(title)s%(time3339)s%(author)s%(url)s
""" % kwargs
    return (header, "")
def parenifnotblank(string):
    """Wrap *string* in parentheses unless it is None or has length zero.

    A None or zero-length argument is returned unchanged.
    """
    is_blank = string is None or len(string) == 0
    return string if is_blank else "(%s)" % string
# Reference: http://www.atomenabled.org/developers/syndication/
# This is common to all feeds we write:
# Per-entry template.  It is filled with %(name)s substitutions and uses the
# title, absoluteurl, date3339 and summary keys.
atomformatstring="""\
%(title)s%(absoluteurl)s%(date3339)s%(summary)s"""
# Variant of the per-entry template, using the same four keys.
# NOTE(review): presumably meant for entries that carry an extra link --
# confirm against the code that uses it.
atomextralinkformatstring="""\
%(title)s%(absoluteurl)s%(date3339)s%(summary)s"""
##########################################################################
def readfiles(defaultdict):
    """Scan the input directories and (re)build the global ``property`` table.

    Every file matching ``*html`` in each INPUTDIRS directory is read.  For
    each file a copy of *defaultdict* is made and then filled in with:

        title        text of the <title> element, or "Untitled" if absent
        <meta name>  one entry per <meta name="..." content="..."> tag,
                     keyed by the lower-cased name (this may override the
                     title or any default)
        filename     the path of the file
        url          the filename unless a meta tag supplied one; URLPATH
                     is prefixed when the value has neither a leading
                     slash nor a leading "http"
        absoluteurl  url, with ABSOLUTEURLPREFIX prepended when url does
                     not start with "http"
        date3339     "YYYY-MM-DDT00:00:00Z", set only when the date holds
                     eight digits with a nonzero month and day
        year         first four characters of the date, when a date is set
        summary      with newlines replaced by spaces

    The result is stored in the module-level ``property`` dict, keyed by
    filename; nothing is returned.

    defaultdict: dict of default property values; it is copied, never
        modified.
    """
    global property
    property = {}

    files = []
    for dirname in INPUTDIRS:
        files.extend(glob.glob(dirname + "/*html"))

    for filename in files:
        # Read the whole page up front; the with-block guarantees the
        # handle is closed even if reading fails.
        with open(filename, "r") as infile:
            data = infile.read()

        props = defaultdict.copy()
        property[filename] = props

        foundtitle = findall(r'<title>([^<]*)</title>', data,
                             DOTALL | IGNORECASE)
        if foundtitle:
            props["title"] = foundtitle[0]
        else:
            props["title"] = "Untitled"

        taglist = findall(
            r'<meta[^>]*name="(.*?)"[^>]*content="(.*?)"[^>]*>',
            data, DOTALL | IGNORECASE)
        for name, content in taglist:
            props[name.lower()] = content

        # This is convenient later, when we wish to iterate over the
        # values without passing the filename keys separately:
        props['filename'] = filename
        props.setdefault('url', filename)

        # Atom requires an annoying time format.  Times are always zero,
        # because I store only dates.
        date = props.get('date') or ''
        dateparts = findall(r'(\d\d\d\d)(\d\d)(\d\d)', date)
        if dateparts:
            year, month, day = dateparts[0]
            # Atom validator complains about missing month or day, so
            # leave date3339 unset in that case.  Entries without it can
            # be skipped by whoever writes the feed.  (An explicit test
            # is used because an assert would vanish under "python -O".)
            if int(month) > 0 and int(day) > 0:
                props['date3339'] = \
                    "%s-%s-%sT00:00:00Z" % (year, month, day)

        # Add prefix to urls without a leading slash or leading http.
        # startswith() is used so an empty url does not raise IndexError.
        url = props['url']
        if not url.startswith('/') and url[0:4].lower() != 'http':
            url = URLPATH + '/' + url
            props['url'] = url
        if url[0:4].lower() != 'http':
            props['absoluteurl'] = ABSOLUTEURLPREFIX + url
        else:
            props['absoluteurl'] = url

        # Add a separate year (only when there is a date at all)
        if date:
            props['year'] = date[0:4]

        # Remove newlines in summary for LJ
        summary = props.get('summary')
        if summary:
            props['summary'] = summary.replace('\n', ' ')
def cmpdate(a, b):
    """Comparison function ordering two ``property`` keys by their date.

    An entry with no "date" value is compared as the empty string.
    Returns the result of cmp() on the two dates.
    """
    date_a = property[a].get("date", "")
    date_b = property[b].get("date", "")
    return cmp(date_a, date_b)
def writefile(filename, headfoot, maxcount, selectfunc, *varargs):
    """ Write one index file from the entries in the global ``property``
    table (so readfiles must have been called first).

    Our arguments are, in order:
        filename to write
        A 2-tuple containing strings with the header and footer.  A single
                None is accepted in place of ('', '') if you want nothing.
        An integer representing the maximum number of entries, or a False
                value for no limit
        a function to select records, or None for all records (None is a
                special case to save you writing a bunch of lambdas that
                just return true)
        0 or more 2-tuples.  The first is the named parameter to group on,
                and the second is a format string for printing out the
                group heading when the value changes.
                For example, you might provide two, one on the "category"
                field and the next on "subcategory".  If the format string
                is None, sorting by this parameter will be done, but no
                headings will be printed (e.g. to sort by date, but not
                print a heading for each different day).  If a 3-tuple is
                given instead, the last item is a sort function taking two
                values and returning a negative, zero or positive number -
                desirable, for example, to put most recent dates first by
                reversing the usual order.
        Exactly one format string for individual items, as above.

    The category string for formats get one argument, the new value for
    the category in question, which should be substituted into the single
    %s which should appear in the format string.  The final format string
    for the final items gets passed a dict of all the item's properties,
    which you use with %(name)s style substitutions.

    The select function is passed one argument, a dict of the properties
    of the file in question.  These are all whatever appears in the
    meta tags plus the following special cases:
        title
        filename
        url (constructed from filename and base if not given explicitly)

    Each header, heading, item and footer is written followed by a newline.
    Once maxcount items have been written no further headings or items are
    produced, but the footer is still written.

    This is complicated, but it is general.  For a more
    intuitive understanding of what it does, look at the examples.
    """
    # Imported locally so this function does not depend on the module's
    # import block.
    from functools import cmp_to_key

    # Enclose this in a list to work around Python's somewhat
    # counterintuitive mutability and scoping
    total = [0]

    # The with-block closes the output file even if a format string or
    # the select function raises part way through.
    with open(filename, "w") as out:

        def emit(text):
            # One line of output; same effect as "print >>out, text".
            out.write("%s\n" % (text,))

        def recursecategory(items, levels):
            # Returns True once the maxcount limit has been reached, which
            # tells every enclosing level to stop as well.
            cat = levels[0]
            # In the last case, iterate and print the format string.
            if len(levels) == 1:
                for item in items:
                    emit(cat % item)
                    total[0] += 1
                    if maxcount and total[0] >= maxcount:
                        return True
                return False
            field, heading = cat[0], cat[1]
            sortfunc = cat[2] if len(cat) > 2 else None
            values = set(item[field] for item in items)
            if sortfunc:
                ordered = sorted(values, key=cmp_to_key(sortfunc))
            else:
                ordered = sorted(values)
            for val in ordered:
                if heading:
                    emit(heading % val)
                group = [item for item in items if item[field] == val]
                if recursecategory(group, levels[1:]):
                    return True
            return False

        if headfoot:
            emit(headfoot[0])
        # Filter by our function
        if selectfunc:
            toprint = [key for key in property if selectfunc(property[key])]
        else:
            toprint = list(property)
        recursecategory([property[key] for key in toprint], varargs)
        if headfoot:
            emit(headfoot[1])
# Load the properties of every input page.  A page that lacks one of these
# meta tags gets the default given here (empty summary/category/subcategory,
# all-zero date).
readfiles({'summary':'', 'category':'', 'subcategory':'', 'date':'00000000'})
# One index per top-level category: indices/technical.shtml and
# indices/personal.shtml, each limited to entries whose category matches
# and grouped by subcategory.  The None argument means no entry limit.
for category in ("Technical", "Personal"):
writefile("indices/%s.shtml" % category.lower(),
htmlheaders('Terran Melconian: %s' % category),
None,
lambda y: y['category'] == category,
('subcategory', "
')
# Index by date: only entries with a nonzero date, grouped by year.
writefile("indices/date.shtml",
htmlheaders('Terran Melconian: Index by Date'),
None, # No limit count
lambda y: int(y['date']), # Must have a nonzero date
('year', "
')
# Index by title: only entries with a category, sorted by title with no
# group headings.
writefile("indices/title.shtml",
htmlheaders('Terran Melconian: Index by Title'),
None, # No limit count
lambda y: y['category'], # must have a category
('title', None),
'
')
# Index by category: only entries with a category, grouped by category.
writefile("indices/category.shtml",
htmlheaders('Terran Melconian: Index by Category'),
None, # No limit count
lambda y: y['category'], # must have a category
('category', "
')
writefile("indices/notindexed.shtml",
htmlheaders('For Internal Use Only: Unindexed Pages (Used to Find Typos)'),
None, # No limit count
lambda y: not y['category'] or not int(y['date']), # must NOT have info
('date', None, lambda x,y: cmp(y,x)),
'