#!/usr/bin/env python
#
# Add unique ID attributes to para tags.  This script should only be
# run by one person, since otherwise it introduces the possibility of
# chaotic conflicts among tags.
#
# IDs have the form x_<hex>.  New IDs are allocated sequentially,
# starting just above the highest ID already present in any file.

import glob, os, re, sys

# A <para> tag that already carries an id="x_<hex>" attribute.
tagged = re.compile('<para[^>]* id="x_([0-9a-f]+)"[^>]*>', re.M)
# A bare <para> tag.  Only this exact spelling is retagged; a <para>
# that has other attributes but no ID is left untouched.
untagged = re.compile('<para>')

# Files processed when the script is run without explicit paths.
names = glob.glob('ch*.xml') + glob.glob('app*.xml')

# Highest paragraph ID handed out so far; advanced by retag().
biggest_id = 0


def _read(name):
    """Return the full contents of the file `name`, closing it afterwards."""
    fp = open(name)
    try:
        return fp.read()
    finally:
        fp.close()


def _replace(name, data):
    """Replace the contents of `name` with `data`.

    The data is written to `name + '.tmp'` first and then renamed over
    the original, so `name` is never left half-written.
    """
    tmpname = name + '.tmp'
    fp = open(tmpname, 'w')
    try:
        fp.write(data)
    finally:
        fp.close()
    os.rename(tmpname, name)


def scan_ids(paths):
    """First pass: find the highest-numbered paragraph ID.

    Returns a `(biggest, errs)` pair: the largest ID found in `paths`
    (0 if there is none) and the number of duplicate IDs seen.  Each
    duplicate is reported on stderr, using the same x_<hex> spelling
    that appears in the file so it can be searched for directly.
    """
    biggest = 0
    seen = set()
    errs = 0
    for name in paths:
        for m in tagged.finditer(_read(name)):
            i = int(m.group(1), 16)
            if i in seen:
                sys.stderr.write('%s: duplication of ID x_%x\n' % (name, i))
                errs += 1
            seen.add(i)
            if i > biggest:
                biggest = i
    return biggest, errs


def retag(s):
    """Substitution callback: return a <para> tag carrying the next free ID.

    `s` is the match object supplied by `untagged.sub` and is not
    inspected.  Each call advances the module-level `biggest_id`.
    """
    global biggest_id
    biggest_id += 1
    return '<para id="x_%x">' % biggest_id


def add_ids(paths):
    """Second pass: add IDs to paragraphs that currently lack them.

    Files that contain no bare <para> tag are not rewritten.
    """
    for name in paths:
        old = _read(name)
        new = untagged.sub(retag, old)
        if new != old:
            _replace(name, new)


def main(paths=None):
    """Tag every bare <para> in `paths` (default: `names`).

    Returns the process exit status: the number of duplicate IDs found,
    clamped to 255 because exit statuses are truncated to 8 bits and a
    larger count could otherwise wrap around to 0 ("success").
    """
    global biggest_id
    if paths is None:
        paths = names
    biggest_id, errs = scan_ids(paths)
    # Files are retagged even when duplicates were found; the duplicates
    # are only reported through stderr and the exit status.
    add_ids(paths)
    return min(errs, 255)


if __name__ == '__main__':
    sys.exit(main())