"""
Utility for parsing HTML5 entity definitions available from:
https://html.spec.whatwg.org/entities.json
https://html.spec.whatwg.org/multipage/named-characters.html
The page now contains the following note:
"This list is static and will not be expanded or changed in the future."
Written by Ezio Melotti and Iuliia Proskurnia.
"""
import os
import sys
import json
from urllib.request import urlopen
from html.entities import html5
# Script path reported on the "Generated by" line that write_items() emits.
SCRIPT_NAME = 'Tools/build/parse_html5_entities.py'
# Human-readable page listing the entities; cited in the generated header.
PAGE_URL = 'https://html.spec.whatwg.org/multipage/named-characters.html'
# JSON entity definitions; downloaded when the script runs and also cited
# in the generated header.
ENTITIES_URL = 'https://html.spec.whatwg.org/entities.json'
# First line written by write_items().  The --patch mode looks for a line
# starting with this text to find where the section to replace begins.
HTML5_SECTION_START = '# HTML5 named character references'
def get_json(url):
    """Fetch *url* and return its body parsed as JSON.

    The response body is read in full, decoded as UTF-8 and then handed
    to the JSON parser.
    """
    with urlopen(url) as response:
        raw = response.read()
    return json.loads(raw.decode('utf-8'))
def create_dict(entities):
    """Build the html5 mapping from the decoded entities JSON object.

    Every key of *entities* has its leading ``&`` characters stripped and
    is mapped to the ``'characters'`` field of the corresponding entry.
    """
    return {
        reference.lstrip('&'): info['characters']
        for reference, info in entities.items()
    }
def compare_dicts(old, new):
    """Print a report of how the *new* dict differs from the *old* one.

    Three sections are printed, each only when it is non-empty and each
    sorted by name: names present only in *new*, names present only in
    *old*, and names present in both whose values differ.
    """
    old_names = old.keys()
    new_names = new.keys()

    only_in_new = sorted(new_names - old_names)
    if only_in_new:
        print('{} entitie(s) have been added:'.format(len(only_in_new)))
        for key in only_in_new:
            print(' {!r}: {!r}'.format(key, new[key]))

    only_in_old = sorted(old_names - new_names)
    if only_in_old:
        print('{} entitie(s) have been removed:'.format(len(only_in_old)))
        for key in only_in_old:
            print(' {!r}: {!r}'.format(key, old[key]))

    # (name, old value, new value) for every shared name whose value changed.
    modified = {
        (key, old[key], new[key])
        for key in old_names & new_names
        if old[key] != new[key]
    }
    if modified:
        print('{} entitie(s) have been modified:'.format(len(modified)))
        for key, before, after in sorted(modified):
            print(' {!r}: {!r} -> {!r}'.format(key, before, after))
def write_items(entities, file=sys.stdout):
    """Write *entities* to *file* as the source of an ``html5`` dict literal.

    The output starts with the section marker and a header comment naming
    this script and the source URLs, followed by one ``name: value`` line
    per entity and a closing brace.  Values are rendered with ascii().
    """
    # Case-insensitive order; names that differ only by case fall back to
    # plain string order.
    ordered_names = sorted(entities, key=lambda name: (name.lower(), name))

    header = (
        HTML5_SECTION_START,
        f'# Generated by {SCRIPT_NAME}',
        f'# from {ENTITIES_URL} and',
        f'# {PAGE_URL}.',
        '# Map HTML5 named character references to the '
        'equivalent Unicode character(s).',
        'html5 = {',
    )
    print(*header, sep='\n', file=file)

    for entity_name in ordered_names:
        characters = entities[entity_name]
        print(f' {entity_name!r}: {characters!a},', file=file)
    print('}', file=file)
if __name__ == '__main__':
    # Command-line entry point.  The entity list is always downloaded first;
    # what happens next depends on the flag found in sys.argv:
    #   --create  print the generated html5 dict to stdout
    #   --patch   rewrite the html5 section of Lib/html/entities.py in place
    #   (none)    report differences between the installed html5 dict and
    #             the downloaded one
    new_html5 = create_dict(get_json(ENTITIES_URL))
    if '--create' in sys.argv:
        write_items(new_html5)
    elif '--patch' in sys.argv:
        fname = 'Lib/html/entities.py'
        temp_fname = fname + '.temp'
        # Use an explicit encoding so the result does not depend on the
        # locale; UTF-8 is the default encoding of Python source files.
        with open(fname, encoding='utf-8') as f1, \
                open(temp_fname, 'w', encoding='utf-8') as f2:
            skip = False
            for line in f1:
                if line.startswith(HTML5_SECTION_START):
                    # Emit the regenerated section in place of the old one.
                    write_items(new_html5, file=f2)
                    skip = True
                    continue
                if skip:
                    # Drop the old section up to and including the line
                    # that closes the dict literal.
                    if line.startswith('}'):
                        skip = False
                    continue
                f2.write(line)
        # os.replace() overwrites the destination in a single step, so the
        # original file is never deleted before its replacement is in place.
        os.replace(temp_fname, fname)
    else:
        if html5 == new_html5:
            print('The current dictionary is updated.')
        else:
            compare_dicts(html5, new_html5)
            print('Run "./python {0} --patch" to update Lib/html/entities.py '
                  'or "./python {0} --create" to see the generated '
                  'dictionary.'.format(__file__))