#!/usr/bin/env python
#
# This script attempts to work around some of the more bizarre and
# quirky behaviours of htlatex.
#
# - We've persuaded htlatex to produce UTF-8, which unfortunately
#   causes it to use huge character sequences to represent even the
#   safe 7-bit ASCII subset of UTF-8.  We fix that up.
#
# - BUT we have to treat angle brackets (for example, redirections in
#   shell script snippets) specially, otherwise they'll break the
#   generated HTML.  (Reported by Johannes Hoff.)
#
# - For some reason, htlatex gives a unique ID to each fancyvrb
#   environment, which makes writing a sane, small CSS stylesheet
#   impossible.  We squish all those IDs down to nothing.
#
# Usage: pass the HTML files to fix as command-line arguments; each
# file is rewritten in place.

import io
import os
import re
import sys

# NOTE: these patterns match *literal* numeric character references in
# the HTML text (an ampersand, '#x', hex digits, a semicolon).

# '<' (U+003C) and '>' (U+003E) written as references.  These must stay
# escaped, so hide_angle() lower-cases them to slip past unicode_re.
angle_re = re.compile(r'(&#x003[CE];)')
# Any 7-bit character (00..7F) written as an upper-case hex reference.
unicode_re = re.compile(r'&#x00([0-7][0-9A-F]);')
# The per-environment numeric suffix htlatex appends to fancyvrb IDs.
fancyvrb_re = re.compile(r'id="fancyvrb\d+"', re.I)
# The Latin ligature code points U+FB00..U+FB04.
ligature_re = re.compile(r'&#xFB0([0-4]);')

# Suffix for the scratch file written next to each input; the PID keeps
# concurrent runs from clobbering each other's temporaries.
tmpsuffix = '.tmp.' + str(os.getpid())

# ASCII expansions of U+FB00..U+FB04, indexed by the last hex digit.
ligatures = ['ff', 'fi', 'fl', 'ffi', 'ffl']


def hide_angle(m):
    """Return the matched angle-bracket reference in lower case.

    unicode_re only accepts upper-case hex digits, so the lower-cased
    reference is left alone by the later ASCII-unescaping pass and the
    bracket stays escaped in the output.
    """
    return m.group(1).lower()


def fix_ascii(m):
    """Return the character whose hex code unicode_re captured."""
    return chr(int(m.group(1), 16))


def expand_ligature(m):
    """Return the plain-letter spelling of the ligature ligature_re matched."""
    return ligatures[int(m.group(1))]


def fix_line(line):
    """Return *line* with all htlatex work-arounds applied.

    The order matters: angle brackets must be hidden before the generic
    ASCII pass runs, otherwise they would be unescaped into raw '<'/'>'.
    """
    line = angle_re.sub(hide_angle, line)
    line = unicode_re.sub(fix_ascii, line)
    line = ligature_re.sub(expand_ligature, line)
    line = fancyvrb_re.sub('id="fancyvrb"', line)
    return line


def fix_file(name):
    """Rewrite the file *name* in place, one fixed-up line at a time.

    Output goes to a temporary sibling file which is renamed over the
    original only after every line has been written.  If anything fails,
    the temporary file is removed and the exception is re-raised, leaving
    the original untouched.
    """
    tmpname = name + tmpsuffix
    try:
        # latin-1 maps every byte to exactly one code point and back, and
        # newline='' disables line-ending translation.  Every substitution
        # is pure ASCII, so all other bytes (including multi-byte UTF-8
        # sequences) pass through unchanged.
        with io.open(name, encoding='latin-1', newline='') as ifp:
            with io.open(tmpname, 'w', encoding='latin-1',
                         newline='') as ofp:
                for line in ifp:
                    ofp.write(fix_line(line))
        os.rename(tmpname, name)
    except Exception:
        # Don't litter the directory with half-written temporaries.
        if os.path.exists(tmpname):
            os.remove(tmpname)
        raise


def main(names):
    """Fix up every file named in *names*."""
    for name in names:
        fix_file(name)


if __name__ == '__main__':
    main(sys.argv[1:])