Path: blob/main/scripts/clean_print_formatting.py
483 views
unlisted
#!/usr/bin/env python3
"""Clean up ugly ASCII-table formatting in Jupyter notebook code cells.

Targets:
1. Separator bars: print("=" * N) and print("-" * N) for N >= 20
2. Column-header lines: print(f"{'header':>N} | ...")
3. Excessive width specifiers: {:>N}, {:<N}, {:^N}
"""

import json
import re
import sys
from pathlib import Path


def is_separator_line(line: str) -> bool:
    """Return True if *line* is a pure separator bar like print("=" * 55).

    Only bars of 20+ characters count as decoration worth removing; shorter
    repeats may be intentional output.
    """
    # Backreference \1 requires the same quote on both sides (the original
    # pattern accepted mismatched quotes, which is never valid Python anyway).
    m = re.match(r'^print\((["\'])[=\-]\1 \* (\d+)\)$', line.strip())
    return m is not None and int(m.group(2)) >= 20


def is_column_header_line(line: str) -> bool:
    """Return True if *line* is a pure column-header print.

    A header looks like: print(f"{'p':>5} | {'|G|=p-1':>8} | ...") --
    an f-string of aligned constants joined by " | " separators.
    """
    stripped = line.strip()
    if not (stripped.startswith('print(f"') or stripped.startswith("print(f'")):
        return False
    # Heuristic: 3+ alignment specs (e.g. ":>5}") and 2+ " | " separators.
    align_specs = len(re.findall(r":\s*[><\^]\d+\}", stripped))
    pipes = stripped.count(' | ')
    return align_specs >= 3 and pipes >= 2


def clean_width_specifiers(line: str, min_width: int = 2) -> str:
    """Remove wide alignment specifiers from f-string expressions in *line*.

    {val:>8} -> {val}     (8 >= min_width: dropped)
    {val:>1} -> {val:>1}  (below min_width: kept)

    NOTE(review): the original docstrings claimed thresholds of 4 and 5 while
    the code removed any width >= 2; the code's actual behavior is kept as the
    default and the threshold is now an explicit, overridable parameter.
    """
    def replace_spec(m: re.Match) -> str:
        prefix = m.group(1)        # "{expr" portion before the colon
        width = int(m.group(3))
        suffix = m.group(4)        # closing brace
        return prefix + suffix if width >= min_width else m.group(0)

    # Matches {expr:>N}, {expr:<N} or {expr:^N}.
    return re.sub(r'(\{[^}]*?)(:[><\^])(\d+)(\})', replace_spec, line)


def clean_constant_fstrings(line: str) -> str:
    """Replace f"{'constant'}" patterns with the bare constant in print lines.

    print(f'{"message"} {"encrypted"}') -> print('message encrypted')

    If every {} expression was a string literal, the f-prefix is dropped too.
    """
    if 'print(f' not in line:
        return line

    # Substitute {"literal"} / {'literal'} with the literal text itself.
    result = re.sub(r'\{"([^"{}]*)"\}', r'\1', line)
    result = re.sub(r"\{'([^'{}]*)'\}", r'\1', result)

    if result != line:
        # If no {} expressions remain in the f-string body, downgrade the
        # f-string to a plain string by stripping the trailing 'f' from
        # the "print(f" prefix.
        match = re.match(r"(\s*print\(f)(['\"])(.*)\2\)(.*)$", result)
        if match:
            prefix, quote, body, suffix = match.groups()
            if '{' not in body:
                result = f"{prefix[:-1]}{quote}{body}{quote}){suffix}"

    return result


def clean_str_concat_in_fstring(line: str) -> str:
    """Replace {"text" + str(var) + "text"} with text{var}text in f-strings.

    Example:
        f'{"a^" + str(p-1) + " mod " + str(p)}' -> f'a^{p-1} mod {p}'

    The chain may start with a literal and end with either a literal or a
    str(...) call; anything the parser cannot understand is left untouched.
    """
    if 'str(' not in line:
        return line

    def simplify_concat(m: re.Match) -> str:
        """Rewrite one {literal + str(expr) + ...} chain; bail out intact on
        anything unparseable (unexpected token, unterminated literal, etc.)."""
        remaining = m.group(1).strip()
        parts: list[tuple[str, str]] = []

        try:
            while remaining:
                remaining = remaining.lstrip()
                if remaining[0] in ('"', "'"):
                    # Quoted string literal (no escape handling needed: the
                    # enclosing f-string could not contain its own quote).
                    quote = remaining[0]
                    end = remaining.index(quote, 1)   # ValueError -> bail
                    parts.append(('str', remaining[1:end]))
                    remaining = remaining[end + 1:].lstrip()
                elif remaining.startswith('str('):
                    # Scan to the matching close paren of str(...).
                    depth, i = 1, 4
                    while i < len(remaining) and depth > 0:
                        if remaining[i] == '(':
                            depth += 1
                        elif remaining[i] == ')':
                            depth -= 1
                        i += 1
                    if depth != 0:
                        return m.group(0)             # unbalanced parens
                    parts.append(('expr', remaining[4:i - 1]))
                    remaining = remaining[i:].lstrip()
                else:
                    return m.group(0)                 # unexpected token
                if remaining.startswith('+'):
                    remaining = remaining[1:].lstrip()
        except ValueError:
            return m.group(0)                         # unterminated literal

        if not parts:
            return m.group(0)

        # Rebuild as plain text with {expr} substitutions.
        return ''.join(text if kind == 'str' else '{' + text + '}'
                       for kind, text in parts)

    # Any brace group whose body starts with a string literal, contains a
    # str( call and has no nested braces.  (The original pattern required
    # the chain to *end* with a string literal too, so chains ending in
    # str(...) -- including the docstring example -- were never matched.)
    return re.sub(r'\{(["\'][^{}]*str\([^{}]*)\}', simplify_concat, line)


def clean_cell_source(source_lines: list[str]) -> tuple[list[str], int]:
    """Clean formatting in a single cell's source lines.

    Returns (cleaned_lines, fix_count).  A line changed by several cleaners
    counts once per cleaner, matching the per-pattern fix accounting.
    """
    fixes = 0
    cleaned: list[str] = []

    for line in source_lines:
        # Drop pure separator bars and pure column-header lines entirely.
        if is_separator_line(line) or is_column_header_line(line):
            fixes += 1
            continue

        # Apply the in-place cleaners in sequence; each may feed the next.
        for cleaner in (clean_width_specifiers,
                        clean_constant_fstrings,
                        clean_str_concat_in_fstring):
            new_line = cleaner(line)
            if new_line != line:
                fixes += 1
                line = new_line

        cleaned.append(line)

    return cleaned, fixes


def process_notebook(path: Path, dry_run: bool = False) -> int:
    """Process a single notebook file. Returns the number of fixes found.

    In dry-run mode fixes are counted but the file is never rewritten.
    """
    # Notebooks are UTF-8 JSON; be explicit so the locale default (e.g.
    # cp1252 on Windows) cannot corrupt the round-trip.
    with open(path, encoding='utf-8') as f:
        nb = json.load(f)

    total_fixes = 0
    modified = False

    for cell in nb.get('cells', []):
        if cell.get('cell_type') != 'code':
            continue

        source = cell.get('source', [])
        if not source:
            continue

        cleaned, fixes = clean_cell_source(source)
        if fixes > 0:
            total_fixes += fixes
            if not dry_run:
                cell['source'] = cleaned
                modified = True

    if modified and not dry_run:
        with open(path, 'w', encoding='utf-8') as f:
            # indent=1 matches the conventional .ipynb on-disk format.
            json.dump(nb, f, indent=1, ensure_ascii=False)
            f.write('\n')

    return total_fixes


def main() -> None:
    """Walk the repository for notebooks and clean each one.

    Pass --dry-run to report fix counts without modifying any file.
    """
    dry_run = '--dry-run' in sys.argv
    repo = Path(__file__).parent.parent

    notebooks = sorted(repo.glob('**/*.ipynb'))
    # Skip Jupyter autosave copies.
    notebooks = [nb for nb in notebooks if '.ipynb_checkpoints' not in str(nb)]

    total = 0
    changed_files = 0

    for nb_path in notebooks:
        fixes = process_notebook(nb_path, dry_run=dry_run)
        if fixes > 0:
            rel = nb_path.relative_to(repo)
            print(f"  {rel}: {fixes} fixes")
            total += fixes
            changed_files += 1

    mode = "DRY RUN" if dry_run else "APPLIED"
    print(f"\n{mode}: {total} fixes across {changed_files} files")


if __name__ == '__main__':
    main()