hgbook

view ja/fixhtml.py @ 856:22cce9e3c445

Validation typos.
author gpiancastelli
date Sat Aug 22 22:14:55 2009 +0200 (2009-08-22)
parents
children
line source
1 #!/usr/bin/env python
2 #
3 # This script attempts to work around some of the more bizarre and
4 # quirky behaviours of htlatex.
5 #
6 # - We've persuaded htlatex to produce UTF-8, which unfortunately
7 # causes it to use huge character sequences to represent even the
8 # safe 7-bit ASCII subset of UTF-8. We fix that up.
9 #
10 # - BUT we have to treat angle brackets (for example, redirections in
11 # shell script snippets) specially, otherwise they'll break the
12 # generated HTML. (Reported by Johannes Hoff.)
13 #
14 # - For some reason, htlatex gives a unique ID to each fancyvrb
15 # environment, which makes writing a sane, small CSS stylesheet
16 # impossible. We squish all those IDs down to nothing.
18 import os
19 import sys
20 import re
# Numeric entities for '<' (U+003C) and '>' (U+003E), captured whole so the
# replacement callback can rewrite them.
angle_re = re.compile(r'(&#x003[CE];)')

# Numeric entities in the 7-bit range U+0000..U+007F; the group holds the two
# low hex digits.  Only upper-case hex digits are matched.
unicode_re = re.compile(r'&#x00([0-7][0-9A-F]);')

# Per-environment fancyvrb ids such as id="fancyvrb12" (case-insensitive).
fancyvrb_re = re.compile(r'id="fancyvrb\d+"', re.IGNORECASE)

# Ligature entities U+FB00..U+FB04; the group holds the final digit.
ligature_re = re.compile(r'&#xFB0([0-4]);')

# Suffix for the scratch file written next to each input; includes the
# current process id.
tmpsuffix = '.tmp.%d' % os.getpid()
def hide_angle(m):
    """Return the matched angle-bracket entity with its letters lower-cased.

    unicode_re only matches upper-case hex digits, so an entity lower-cased
    here (e.g. '&#x003c;') is skipped by the later ASCII substitution and
    stays escaped in the output.
    """
    entity = m.group(1)
    return entity.lower()
def fix_ascii(m):
    """Return the plain character for a matched 7-bit numeric entity.

    m.group(1) holds the two hex digits of the code point (e.g. '41' for
    'A').  Characters that are significant in HTML markup ('&', '<', '>')
    are NOT decoded: the original entity text is returned unchanged.
    Decoding '&#x0026;' to a bare '&' would let following text form a
    bogus entity (for example '&#x0026;lt;' would become '&lt;' and render
    as '<'), corrupting the generated page.
    """
    char = chr(int(m.group(1), 16))
    if char in '&<>':
        # Keep markup-significant characters escaped exactly as they were.
        return m.group(0)
    return char
# Plain-letter spellings of the ligatures U+FB00..U+FB04, indexed by the
# last hex digit of the code point.
ligatures = [
    'ff',
    'fi',
    'fl',
    'ffi',
    'ffl',
]


def expand_ligature(m):
    """Return the ordinary letters for a matched ligature entity.

    m.group(1) is the final digit of the entity's code point, which is used
    directly as an index into the ligatures table.
    """
    index = int(m.group(1))
    return ligatures[index]
# Rewrite every file named on the command line in place: the fixed-up text
# is written to a sibling scratch file, which then replaces the original.
for name in sys.argv[1:]:
    tmpname = name + tmpsuffix
    try:
        # open() replaces the Python-2-only file() constructor, and the
        # with-blocks close both handles even if a substitution or a write
        # raises part-way through.
        with open(name) as ifp:
            with open(tmpname, 'w') as ofp:
                for line in ifp:
                    # Order matters: angle-bracket entities are lower-cased
                    # first so that unicode_re (upper-case hex only) does
                    # not decode them.
                    line = angle_re.sub(hide_angle, line)
                    line = unicode_re.sub(fix_ascii, line)
                    line = ligature_re.sub(expand_ligature, line)
                    line = fancyvrb_re.sub('id="fancyvrb"', line)
                    ofp.write(line)
        # Replace the original only after the scratch file is complete and
        # closed.
        os.rename(tmpname, name)
    finally:
        # After a successful rename the scratch name no longer exists; if
        # anything failed earlier, do not leave a stale scratch file behind.
        if os.path.exists(tmpname):
            os.remove(tmpname)