pokecrystal-board/tools/toc.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Usage: python toc.py file.md

Replace a "## TOC" heading in a Markdown file with a table of contents,
generated from the other headings in the file. Supports multiple files.
Only headings that start with "##" and come after the TOC heading are listed.
"""

import sys
import re
from collections import namedtuple
from urllib.parse import quote

# Heading text written in place of the TOC placeholder line
toc_name = 'Contents'
# Placeholder lines (compared after stripping trailing whitespace) that mark
# where the table of contents is inserted
valid_toc_headings = {'## TOC', '##TOC'}

# One TOC entry: heading text, link anchor, and nesting depth (0 for "##")
TocItem = namedtuple('TocItem', ['name', 'anchor', 'level'])
# Runs of characters that are not word characters, hyphens, or spaces
punctuation_rx = re.compile(r'[^\w\- ]+')
# Leading "<digits>. " prefix of a numbered heading such as "1. Setup"
numbered_heading_rx = re.compile(r'^[0-9]+\. ')
# Extra characters stripped from anchors after punctuation removal
# NOTE(review): presumably listed because Python's \w accepts them -- confirm
specialchar_rx = re.compile(r'[⅔]+')

def name_to_anchor(name):
	"""Return the URL-encoded link anchor for the heading text `name`.

	Follows GitHub's algorithm for generating anchors from headings:
	https://github.com/jch/html-pipeline/blob/master/lib/html/pipeline/toc_filter.rb
	"""
	lowered = name.strip().lower()
	# Drop punctuation first so only word characters, hyphens and spaces
	# remain, then turn every space into a hyphen.
	dashed = punctuation_rx.sub('', lowered).replace(' ', '-')
	# Strip the remaining special characters before percent-encoding.
	return quote(specialchar_rx.sub('', dashed))

def get_toc_index(lines):
	"""Return the index of the first TOC placeholder line, or None.

	A line is the placeholder when, ignoring trailing whitespace, it is
	one of `valid_toc_headings`.
	"""
	candidates = (
		index for index, text in enumerate(lines)
		if text.rstrip() in valid_toc_headings
	)
	# The generator is lazy, so scanning stops at the first match.
	return next(candidates, None)

def get_toc_items(lines, toc_index):
	"""Yield a TocItem for every "##"-or-deeper heading after the TOC line.

	lines: the file's lines (each may end with a newline).
	toc_index: index of the TOC placeholder line; headings at or before
		this index are not listed.

	An item's level is the number of '#' signs beyond the first two, so a
	"##" heading has level 0, "###" has level 1, and so on.

	Lines inside fenced code blocks (``` or ~~~) are skipped, because a
	"## ..." line there is code (e.g. a shell comment), not a heading.

	When several listed headings produce the same anchor, the second and
	later ones get "-1", "-2", ... appended, matching the uniqueness rule
	of the anchor algorithm referenced in name_to_anchor. Only headings
	yielded here are counted; headings before the TOC line or with a
	single '#' do not take part in the numbering.
	"""
	fence = None # marker that opened the current code fence, if any
	anchor_counts = {} # anchor -> number of items already yielded with it
	for i, line in enumerate(lines):
		fence_match = re.match(r' {0,3}(`{3,}|~{3,})', line)
		if fence_match:
			marker = fence_match.group(1)
			rest = line[fence_match.end():]
			if fence is None:
				# A backtick fence's info string cannot contain backticks;
				# such a line is inline code rather than a fence opener.
				if marker[0] != '`' or '`' not in rest:
					fence = marker
			elif marker[0] == fence[0] and len(marker) >= len(fence) and not rest.strip():
				# Closing fence: same character, at least as long, nothing after it
				fence = None
			# A line starting with a fence marker is never a '##' heading
			continue
		if fence is not None or i <= toc_index:
			continue
		if line.startswith('##'):
			name = line.lstrip('#')
			# Both strings still carry the line ending, so the difference
			# is exactly the number of leading '#' signs.
			level = len(line) - len(name) - len('##')
			name = name.strip()
			anchor = name_to_anchor(name)
			count = anchor_counts.get(anchor, 0)
			anchor_counts[anchor] = count + 1
			if count:
				anchor = f'{anchor}-{count}'
			yield TocItem(name, anchor, level)

def toc_string(toc_items):
	"""Render `toc_items` as a Markdown list under a "## <toc_name>" heading.

	Each item becomes one list row linking to its anchor, indented two
	spaces per level. A heading that starts with "<digits>. " keeps that
	number as its list marker; every other heading gets a "-" bullet.
	The returned text ends with a newline.
	"""
	def render(item):
		label, anchor, level = item
		marker = '-'
		if numbered_heading_rx.match(label):
			# Reuse the heading's own number as the list marker.
			number, _, label = label.partition('.')
			marker = number + '.'
			label = label.lstrip()
		indent = level * '  '
		return f'{indent}{marker} [{label}](#{anchor})'

	rows = [f'## {toc_name}', '', *map(render, toc_items)]
	return '\n'.join(rows) + '\n'

def add_toc(filename):
	"""Replace the TOC placeholder line in `filename` with a generated TOC.

	Returns None when the file has no placeholder line and False when no
	headings follow it; the file is not rewritten in either case.
	Returns True after the file has been rewritten.
	"""
	with open(filename, 'r', encoding='utf-8') as source:
		lines = source.readlines()

	toc_index = get_toc_index(lines)
	if toc_index is None:
		return None # no TOC heading

	toc_items = list(get_toc_items(lines, toc_index))
	if not toc_items:
		return False # no content headings

	with open(filename, 'w', encoding='utf-8') as target:
		# Everything before the placeholder, the TOC itself, then the rest.
		target.writelines(lines[:toc_index])
		target.write(toc_string(toc_items))
		target.writelines(lines[toc_index + 1:])
	return True # OK

def main():
	"""Add a TOC to every file named on the command line and report each result.

	With no file arguments, prints a usage line to stderr and exits with
	status 1. Warnings go to stderr; the file name and "OK" go to stdout.
	"""
	filenames = sys.argv[1:]
	if not filenames:
		print(f'Usage: {sys.argv[0]} file.md', file=sys.stderr)
		sys.exit(1)
	for filename in filenames:
		print(filename)
		outcome = add_toc(filename)
		if outcome is None:
			message, stream = 'Warning: No "## TOC" heading found', sys.stderr
		elif outcome is False:
			message, stream = 'Warning: No content headings found', sys.stderr
		else:
			message, stream = 'OK', sys.stdout
		print(message, file=stream)

# Run the command-line entry point only when executed as a script, not on import
if __name__ == '__main__':
	main()