#!/usr/bin/env python
#
# This script attempts to work around some of the more bizarre and
# quirky behaviours of htlatex.
#
# - We've persuaded htlatex to produce UTF-8, which unfortunately
#   causes it to use huge character sequences to represent even the
#   safe 7-bit ASCII subset of UTF-8.  We fix that up.
#
# - BUT we have to treat angle brackets (for example, redirections in
#   shell script snippets) specially, otherwise they'll break the
#   generated HTML.  (Reported by Johannes Hoff.)
#
# - For some reason, htlatex gives a unique ID to each fancyvrb
#   environment, which makes writing a sane, small CSS stylesheet
#   impossible.  We squish all those IDs down to nothing.
#
# - Numeric references to the Latin ligature code points
#   U+FB00..U+FB04 are expanded back into their plain letters.
#
# Usage: pass the files to fix as command-line arguments; each file is
# rewritten in place (via a temporary file next to it).

import io
import os
import re
import sys

# Numeric character references for '<' (0x3C) and '>' (0x3E).
angle_re = re.compile(r'(&#x003[CE];)')
# Numeric character references for the 7-bit ASCII range (0x00-0x7F).
# Only upper-case hex digits are matched; hide_angle() relies on that.
unicode_re = re.compile(r'&#x00([0-7][0-9A-F]);')
# Per-environment ids such as id="fancyvrb12".
fancyvrb_re = re.compile(r'id="fancyvrb\d+"', re.I)
# Numeric character references for the ligatures U+FB00..U+FB04.
ligature_re = re.compile(r'&#xFB0([0-4]);')

# Suffix of the temporary file written next to each input file.
tmpsuffix = '.tmp.' + str(os.getpid())


def hide_angle(m):
    """Return the matched angle-bracket reference in lower case.

    Lower-casing the hex digit (C -> c, E -> e) keeps the reference
    valid HTML but makes it invisible to ``unicode_re``, which only
    matches upper-case hex digits, so the bracket is never turned into
    a literal '<' or '>'.
    """
    return m.group(1).lower()


def fix_ascii(m):
    """Return the ASCII character named by the matched hex reference."""
    return chr(int(m.group(1), 16))


# Plain-letter expansions, indexed by the last hex digit of U+FB00..U+FB04.
ligatures = ['ff', 'fi', 'fl', 'ffi', 'ffl']


def expand_ligature(m):
    """Return the plain letters for the matched ligature reference."""
    return ligatures[int(m.group(1))]


def fix_line(line):
    """Return ``line`` with every htlatex work-around applied.

    The substitutions run in a fixed order: angle brackets must be
    hidden before the generic ASCII pass, otherwise they would be
    expanded into literal '<' / '>' characters.
    """
    line = angle_re.sub(hide_angle, line)
    line = unicode_re.sub(fix_ascii, line)
    line = ligature_re.sub(expand_ligature, line)
    return fancyvrb_re.sub('id="fancyvrb"', line)


def fix_file(name):
    """Rewrite the file ``name`` in place with ``fix_line`` applied.

    The fixed text is written to ``name + tmpsuffix`` and renamed over
    the original only after both files have been closed.  Files are
    read and written as UTF-8; ``newline=''`` disables newline
    translation so existing line endings are preserved.

    Raises whatever ``io.open`` / ``os.rename`` raise (for example
    ``IOError``/``OSError``), and ``UnicodeDecodeError`` if the input is
    not valid UTF-8.
    """
    tmpname = name + tmpsuffix
    # Nested (rather than multi-item) ``with`` keeps Python 2.6 working.
    with io.open(name, encoding='utf-8', newline='') as ifp:
        with io.open(tmpname, 'w', encoding='utf-8', newline='') as ofp:
            for line in ifp:
                ofp.write(fix_line(line))
    os.rename(tmpname, name)


def main(argv=None):
    """Fix every file named in ``argv`` (default: ``sys.argv[1:]``)."""
    if argv is None:
        argv = sys.argv[1:]
    for name in argv:
        fix_file(name)


if __name__ == '__main__':
    main()