Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
duyuefeng0708
GitHub Repository: duyuefeng0708/Cryptography-From-First-Principle
Path: blob/main/scripts/fix_char_source.py
483 views
unlisted
1
#!/usr/bin/env python3
2
"""Fix corrupted notebook cells where source is stored as individual characters.
3
4
Some cells have source = ['#', ' ', 'c', 'o', 'm', 'p', 'u', 't', 'e', '\n', ...]
5
instead of source = ['# compute\n', ...]. This script joins the characters back
6
into proper line-delimited arrays.
7
"""
8
9
import json
10
import sys
11
from pathlib import Path
12
13
14
def is_char_corrupted(source: list[str]) -> bool:
    """Check if source is stored as individual characters.

    A source is treated as corrupted when it has more than 10 entries and
    more than 80% of those entries are at most two characters long.

    Args:
        source: The cell's ``source`` value as loaded from the notebook JSON.

    Returns:
        True if the entries look like individual characters rather than
        whole lines, False otherwise.
    """
    # The notebook format also allows ``source`` to be one plain string.
    # Iterating a string yields single characters, so without this guard
    # every string-valued source longer than 10 characters would be
    # misreported as corrupted.
    if isinstance(source, str):
        return False
    # Very short sources give too little evidence either way.
    if len(source) <= 10:
        return False
    short_entries = sum(1 for s in source if len(s) <= 2)
    return short_entries / len(source) > 0.8
20
21
22
def fix_source(source: list[str]) -> list[str]:
    """Rebuild a line-delimited source list from character-level fragments.

    The fragments are concatenated and cut again after every newline
    character, so each returned entry except possibly the last ends with a
    newline.

    Args:
        source: Fragments of the cell text, in order.

    Returns:
        The text as a list of lines. If the fragments concatenate to an
        empty string, ``source`` itself is returned unchanged.
    """
    text = ''.join(source)
    if text == '':
        # Nothing to rebuild; hand the input back untouched.
        return source

    # Every piece before the final newline is a complete line; whatever
    # follows it is an unterminated tail, which may be empty.
    *complete, tail = text.split('\n')
    rebuilt = [piece + '\n' for piece in complete]
    if tail:
        rebuilt.append(tail)
    return rebuilt
39
40
41
def process_notebook(path: Path, dry_run: bool = False) -> int:
    """Process a single notebook. Returns number of fixed cells.

    Args:
        path: Location of the notebook file to inspect.
        dry_run: When true, corrupted cells are counted but the file is
            left untouched.

    Returns:
        The number of cells whose source was detected as stored
        character by character.
    """
    # Jupyter stores notebooks as UTF-8 JSON. Relying on the platform's
    # default encoding can mis-decode the file, or fail when non-ASCII
    # text is written back, so the encoding is stated explicitly.
    with open(path, encoding='utf-8') as f:
        nb = json.load(f)

    fixed = 0
    for cell in nb.get('cells', []):
        source = cell.get('source', [])
        if not source:
            continue
        if not is_char_corrupted(source):
            continue
        # Rebuilt even in dry-run mode so a dry run exercises the same
        # code path as a real run.
        new_source = fix_source(source)
        fixed += 1
        if not dry_run:
            cell['source'] = new_source

    # Only rewrite the file when at least one cell actually changed.
    if fixed and not dry_run:
        with open(path, 'w', encoding='utf-8') as f:
            # ensure_ascii=False keeps non-ASCII text as-is in the output.
            json.dump(nb, f, indent=1, ensure_ascii=False)
            f.write('\n')

    return fixed
67
68
69
def main():
    """Scan notebooks under the directory two levels above this file.

    Every ``.ipynb`` file found (excluding paths that contain
    ``.ipynb_checkpoints``) is passed to ``process_notebook``. Passing
    ``--dry-run`` on the command line reports the findings without
    modifying any file. A per-file line is printed for each notebook with
    at least one fix, followed by a summary line.
    """
    root = Path(__file__).parent.parent
    is_dry = '--dry-run' in sys.argv

    candidates = [
        p for p in sorted(root.glob('**/*.ipynb'))
        if '.ipynb_checkpoints' not in str(p)
    ]

    cell_total = 0
    file_total = 0
    for candidate in candidates:
        count = process_notebook(candidate, dry_run=is_dry)
        if count <= 0:
            continue
        shown = candidate.relative_to(root)
        print(f' {shown}: {count} cells fixed')
        cell_total += count
        file_total += 1

    label = 'APPLIED' if not is_dry else 'DRY RUN'
    print(f'\n{label}: {cell_total} cells fixed across {file_total} files')
89
90
91
# Run the fixer only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
93
94