hgbook
changeset 251:2e73abddad21
Avoid converting UTF8-encoded angle brackets into ASCII (per Johannes Hoff).
Finally write up what fixhtml.py is actually doing.
author | Bryan O'Sullivan <bos@serpentine.com> |
---|---|
date | Wed May 30 21:50:21 2007 -0700 (2007-05-30) |
parents | 5ecf66974def |
children | f2061ece8ed9 |
files | en/fixhtml.py |
line diff
```diff
1.1 --- a/en/fixhtml.py Wed May 30 20:06:05 2007 -0700
1.2 +++ b/en/fixhtml.py Wed May 30 21:50:21 2007 -0700
1.3 @@ -1,14 +1,33 @@
1.4  #!/usr/bin/env python
1.5 +#
1.6 +# This script attempts to work around some of the more bizarre and
1.7 +# quirky behaviours of htlatex.
1.8 +#
1.9 +# - We've persuaded htlatex to produce UTF-8, which unfortunately
1.10 +#   causes it to use huge character sequences to represent even the
1.11 +#   safe 7-bit ASCII subset of UTF-8. We fix that up.
1.12 +#
1.13 +# - BUT we have to treat angle brackets (for example, redirections in
1.14 +#   shell script snippets) specially, otherwise they'll break the
1.15 +#   generated HTML. (Reported by Johannes Hoff.)
1.16 +#
1.17 +# - For some reason, htlatex gives a unique ID to each fancyvrb
1.18 +#   environment, which makes writing a sane, small CSS stylesheet
1.19 +#   impossible. We squish all those IDs down to nothing.
1.20
1.21  import os
1.22  import sys
1.23  import re
1.24
1.25 -unicode_re = re.compile(r'&#x00([0-7][0-9a-f]);', re.I)
1.26 +angle_re = re.compile(r'(&#x003[CE];)')
1.27 +unicode_re = re.compile(r'&#x00([0-7][0-9A-F]);')
1.28  fancyvrb_re = re.compile(r'id="fancyvrb\d+"', re.I)
1.29
1.30  tmpsuffix = '.tmp.' + str(os.getpid())
1.31
1.32 +def hide_angle(m):
1.33 +    return m.group(1).lower()
1.34 +
1.35  def fix_ascii(m):
1.36      return chr(int(m.group(1), 16))
1.37
1.38 @@ -16,6 +35,7 @@
1.39      tmpname = name + tmpsuffix
1.40      ofp = file(tmpname, 'w')
1.41      for line in file(name):
1.42 +        line = angle_re.sub(hide_angle, line)
1.43          line = unicode_re.sub(fix_ascii, line)
1.44          line = fancyvrb_re.sub('id="fancyvrb"', line)
1.45          ofp.write(line)
```