Path: blob/main/scripts/fix_char_source.py
483 views
unlisted
#!/usr/bin/env python3
r"""Fix corrupted notebook cells where source is stored as individual characters.

Some cells have source = ['#', ' ', 'c', 'o', 'm', 'p', 'u', 't', 'e', '\n', ...]
instead of source = ['# compute\n', ...]. This script joins the characters back
into proper line-delimited arrays.

Run without arguments to rewrite every ``*.ipynb`` file found under the
directory two levels above this script; pass ``--dry-run`` to only report
what would be changed.
"""

import json
import sys
from pathlib import Path

# A source array is only inspected when it has more than this many entries;
# shorter arrays are too small for the ratio below to be meaningful.
_MIN_ENTRIES = 10
# Entries of at most this many characters count as "character-like".
_MAX_FRAGMENT_LEN = 2
# Fraction of character-like entries above which a cell is deemed corrupted.
_CORRUPTION_RATIO = 0.8


def is_char_corrupted(source: list[str]) -> bool:
    """Check if source is stored as individual characters.

    Args:
        source: The ``source`` value of a notebook cell. Normally a list of
            lines; a plain string is accepted and never reported as corrupted.

    Returns:
        True when the list has more than 10 entries and more than 80% of
        them are at most 2 characters long.
    """
    # A bare string is a legal ``source`` value. Iterating it yields single
    # characters, which would otherwise look exactly like the corruption.
    if isinstance(source, str):
        return False
    if len(source) <= _MIN_ENTRIES:
        return False
    short_entries = sum(1 for s in source if len(s) <= _MAX_FRAGMENT_LEN)
    return short_entries / len(source) > _CORRUPTION_RATIO


def fix_source(source: list[str]) -> list[str]:
    """Join character-level source back into lines.

    Args:
        source: Fragments whose concatenation is the cell text.

    Returns:
        The text split after every ``'\\n'``, each line keeping its trailing
        newline. A final line without a newline is kept as-is, and no empty
        trailing entry is produced. If the concatenated text is empty,
        ``source`` is returned unchanged.
    """
    full_text = ''.join(source)
    if not full_text:
        return source

    # Split on '\n' only (not str.splitlines, which also breaks on '\r',
    # form feeds, etc.) so that nothing but real newlines delimits lines.
    *complete, tail = full_text.split('\n')
    result = [line + '\n' for line in complete]
    if tail:  # text did not end with a newline: keep the partial last line
        result.append(tail)
    return result


def process_notebook(path: Path, dry_run: bool = False) -> int:
    """Process a single notebook.

    Args:
        path: Path to the ``.ipynb`` file.
        dry_run: When True, count the cells that would be fixed but leave
            the file untouched.

    Returns:
        Number of fixed cells (cells whose source actually changes).
    """
    # Notebooks are UTF-8 JSON; do not depend on the platform default encoding.
    with open(path, encoding='utf-8') as f:
        nb = json.load(f)

    fixed = 0
    for cell in nb.get('cells', []):
        source = cell.get('source', [])
        if not source or not is_char_corrupted(source):
            continue

        new_source = fix_source(source)
        if new_source == source:
            # Heuristic false positive (e.g. many very short real lines):
            # nothing to repair, so neither count it nor rewrite the file.
            continue

        fixed += 1
        if not dry_run:
            cell['source'] = new_source

    if fixed and not dry_run:
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(nb, f, indent=1, ensure_ascii=False)
            f.write('\n')

    return fixed


def main():
    """Fix every notebook under the repository root and print a summary."""
    dry_run = '--dry-run' in sys.argv
    # The script lives one directory below the repository root.
    repo = Path(__file__).parent.parent

    notebooks = sorted(repo.glob('**/*.ipynb'))
    # Skip Jupyter's autosave copies.
    notebooks = [nb for nb in notebooks if '.ipynb_checkpoints' not in str(nb)]

    total = 0
    changed_files = 0

    for nb_path in notebooks:
        fixes = process_notebook(nb_path, dry_run=dry_run)
        if fixes > 0:
            rel = nb_path.relative_to(repo)
            print(f' {rel}: {fixes} cells fixed')
            total += fixes
            changed_files += 1

    mode = 'DRY RUN' if dry_run else 'APPLIED'
    print(f'\n{mode}: {total} cells fixed across {changed_files} files')


if __name__ == '__main__':
    main()