hgbook

annotate ja/fixhtml.py @ 862:ad6d3f5245e7

Link back to the original English version of the book.
author gpiancastelli
date Fri Aug 28 12:21:45 2009 +0200 (2009-08-28)
parents
children
rev   line source
foozy@708 1 #!/usr/bin/env python
foozy@708 2 #
foozy@708 3 # This script attempts to work around some of the more bizarre and
foozy@708 4 # quirky behaviours of htlatex.
foozy@708 5 #
foozy@708 6 # - We've persuaded htlatex to produce UTF-8, which unfortunately
foozy@708 7 # causes it to use huge character sequences to represent even the
foozy@708 8 # safe 7-bit ASCII subset of UTF-8. We fix that up.
foozy@708 9 #
foozy@708 10 # - BUT we have to treat angle brackets (for example, redirections in
foozy@708 11 # shell script snippets) specially, otherwise they'll break the
foozy@708 12 # generated HTML. (Reported by Johannes Hoff.)
foozy@708 13 #
foozy@708 14 # - For some reason, htlatex gives a unique ID to each fancyvrb
foozy@708 15 # environment, which makes writing a sane, small CSS stylesheet
foozy@708 16 # impossible. We squish all those IDs down to nothing.
foozy@708 17
foozy@708 18 import os
foozy@708 19 import sys
foozy@708 20 import re
foozy@708 21
# Numeric references for '<' and '>'.  The whole reference is captured so
# that hide_angle() can lower-case its hex digits, which stops unicode_re
# (upper-case hex only) from decoding it into a raw angle bracket.
angle_re = re.compile(r'(&#x003[CE];)')

# 7-bit ASCII written as a four-digit hexadecimal character reference;
# group 1 is the two significant hex digits.  The ampersand (&#x0026;) is
# deliberately excluded: decoding it would put a bare '&' into the HTML,
# where it can combine with the following text into an unintended entity
# (for example "&#x0026;amp;" would turn into "&amp;").  Lower-casing
# cannot hide "26" the way it hides "3C"/"3E", hence the lookahead.
unicode_re = re.compile(r'&#x00(?!26)([0-7][0-9A-F]);')

# Numbered fancyvrb ids (matched case-insensitively); the main loop
# replaces each with the plain id="fancyvrb".
fancyvrb_re = re.compile(r'id="fancyvrb\d+"', re.I)

# Latin ligature references U+FB00..U+FB04; group 1 is the last hex digit.
ligature_re = re.compile(r'&#xFB0([0-4]);')

# Suffix for the scratch file written next to each input; it embeds the
# process id of this run.
tmpsuffix = '.tmp.' + str(os.getpid())
foozy@708 28
def hide_angle(m):
    """Return the matched angle-bracket reference in lower case.

    m is a match object whose group 1 holds the whole character
    reference (e.g. "&#x003C;").  unicode_re only accepts upper-case hex
    digits, so the lower-cased text is left alone by the ASCII pass.
    """
    entity = m.group(1)
    return entity.lower()
foozy@708 31
def fix_ascii(m):
    """Return the character named by the hex digits in group 1 of m.

    m is a match object whose group 1 is a hexadecimal code point
    (e.g. "41" yields "A").
    """
    code_point = int(m.group(1), 16)
    return chr(code_point)
foozy@708 34
# Plain-letter spellings of the ligatures U+FB00..U+FB04, indexed by the
# final hex digit of the code point.
ligatures = ['ff', 'fi', 'fl', 'ffi', 'ffl']


def expand_ligature(m):
    """Return the plain letters for the ligature reference matched by m.

    Group 1 of m is the single digit that selects an entry of the
    module-level ligatures list.
    """
    index = int(m.group(1))
    return ligatures[index]
foozy@708 39
# Rewrite every file named on the command line in place: the cleaned-up
# text goes to a scratch file beside the original, which is then renamed
# over it once the whole file has been processed.
for name in sys.argv[1:]:
    tmpname = name + tmpsuffix
    try:
        # open() instead of the file() builtin, which Python 3 removed.
        # The input is opened first so that a missing or unreadable file
        # does not leave an empty scratch file behind, and the with
        # blocks close both handles even if a write fails.
        with open(name) as ifp:
            with open(tmpname, 'w') as ofp:
                for line in ifp:
                    # Angle brackets must be hidden before the ASCII
                    # pass, otherwise they would be decoded into raw
                    # '<' and '>'.
                    line = angle_re.sub(hide_angle, line)
                    line = unicode_re.sub(fix_ascii, line)
                    line = ligature_re.sub(expand_ligature, line)
                    line = fancyvrb_re.sub('id="fancyvrb"', line)
                    ofp.write(line)
    except Exception:
        # Do not leave a partial scratch file next to the original.
        if os.path.exists(tmpname):
            os.remove(tmpname)
        raise
    os.rename(tmpname, name)