#! /usr/bin/python3
# ==================================================================
# Use regexps to break apart lines in a file and put them back
# together modified; in particular, the lines (rows) of an
# HTML table.
# ==================================================================
import re
# ---- Read IN line by line and echo it to stdout. Lines that are
# ---- table rows are split into their cells, the second cell is
# ---- turned into an entity reference ('&' + text + ';') unless it
# ---- already starts with '&', and the row is re-joined and printed.
# ---- All other lines are printed stripped but otherwise unchanged.
IN = 'html_special_characters.html'  # input file name
pt = '</td><td>'  # literal text separating two adjacent cells

# ---- compile the patterns once, outside the loop
ROW_RE = re.compile(r'^.*?</td>')  # a row line contains a '</td>'
CELL_SEP_RE = re.compile(re.escape(pt))  # escaped: always matched literally
EXPECTED_CELLS = 4  # number of elements a row must split into

ic = 0  # input line count
mc = 0  # modified (recombined row) line count
oc = 0  # output line count

# ---- 'with' closes the file even if an exception is raised in the loop
# ---- NOTE(review): no encoding is passed, so the platform default is
# ---- used -- confirm it matches the encoding of the input file
with open(IN, 'r') as inFile:
    for line in inFile:
        ic += 1  # increment input count
        line = line.strip()

        # ---- is this a table row?
        if ROW_RE.match(line) is not None:
            # ---- split the line
            cells = CELL_SEP_RE.split(line)
            ##for cell in cells:
            ##    print(cell)

            # ---- make sure we have the correct number of elements;
            # ---- if not, report it and stop processing the file
            if len(cells) != EXPECTED_CELLS:
                print(cells)
                print('Error: wrong number of elements {} found'.format(len(cells)))
                break

            # ---- do not modify element 1 if it starts with a '&',
            # ---- else wrap it in '&' and ';'
            if not cells[1].startswith('&'):
                cells[1] = '&' + cells[1] + ';'

            # ---- re-combine the line elements and output the row
            print(pt.join(cells))
            mc += 1  # increment modified count
        else:
            # ---- output non-row lines
            print(line)

        oc += 1  # increment output count

# ---- optional summary (left disabled so stdout holds only the HTML)
##print('{} lines input'.format(ic))
##print('{} lines modified'.format(mc))
##print('{} lines output'.format(oc))