#!/usr/bin/env python
#
# Add unique ID attributes to para tags.  This script should only be
# run by one person, since otherwise it introduces the possibility of
# chaotic conflicts among tags.
#
# Usage: run from the directory holding the ch*.xml / app*.xml files.
# The exit status is the number of duplicated IDs found (0 on success).

import glob
import os
import re
import sys

# A <para ...> tag that already carries an id of the form x_<hex digits>;
# group 1 captures the hex digits.
tagged = re.compile(r'<para[^>]* id="x_([0-9a-f]+)"[^>]*>', re.M)

# A bare <para> tag with no attributes at all; these are the ones that
# still need an ID.
untagged = re.compile(r'<para>')

# Highest paragraph ID seen or handed out so far.  scan_ids() raises it to
# the largest ID present in the files; retag() increments it for every new
# ID it issues.
biggest_id = 0


def read_text(name):
    """Return the full contents of the file *name*, closing it afterwards."""
    with open(name) as fp:
        return fp.read()


def scan_ids(names):
    """First pass: find the highest-numbered paragraph ID.

    Reads every file in *names*, raises the module-level ``biggest_id`` to
    the largest ``x_<hex>`` ID found, and reports each ID that occurs more
    than once (within or across files) on stderr.

    Returns the number of duplicate occurrences reported.
    """
    global biggest_id
    seen = set()
    errs = 0
    for name in names:
        for m in tagged.finditer(read_text(name)):
            i = int(m.group(1), 16)
            if i in seen:
                # sys.stderr.write works on both Python 2 and Python 3,
                # unlike the ``print >> sys.stderr`` statement.
                sys.stderr.write('%s: duplication of ID %s\n' % (name, i))
                errs += 1
            seen.add(i)
            if i > biggest_id:
                biggest_id = i
    return errs


def retag(s):
    """Return a ``<para>`` tag carrying the next unused ID.

    Intended as the replacement callback for ``untagged.sub``; the match
    object *s* is not inspected because the bare tag has nothing worth
    preserving.  Each call increments the module-level ``biggest_id``.
    """
    global biggest_id
    biggest_id += 1
    # IDs are written in hex to mirror the base-16 parse in scan_ids().
    return '<para id="x_%x">' % biggest_id


def add_ids(names):
    """Second pass: add IDs to paragraphs that currently lack them.

    Files that contain no bare ``<para>`` tag are left untouched.  Changed
    files are written to ``<name>.tmp`` first and then renamed over the
    original, so an interrupted write cannot leave a half-written file
    under the real name.
    """
    for name in names:
        old = read_text(name)
        new = untagged.sub(retag, old)
        if new != old:
            tmpname = name + '.tmp'
            with open(tmpname, 'w') as fp:
                fp.write(new)
            os.rename(tmpname, name)


def main():
    """Tag every chapter and appendix file in the current directory.

    Returns the number of duplicated IDs that were reported.
    """
    names = glob.glob('ch*.xml') + glob.glob('app*.xml')
    errs = scan_ids(names)
    add_ids(names)
    return errs


if __name__ == '__main__':
    # Exit statuses are truncated to 8 bits on POSIX, so clamp the count;
    # otherwise exactly 256 duplicates would be reported as success.
    sys.exit(min(main(), 255))