#! /usr/bin/env python
# Source: GitHub repository 1N3/Sn1per, path bin/pyText2pdf.py
"""
pyText2Pdf - Python script to convert plain text files into Adobe
Acrobat PDF files with support for arbitrary page breaks etc.

Version 2.0

Author: Anand B Pillai <abpillai at gmail dot com>

"""

# Derived from http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/189858

import io
import optparse
import os
import re
import string
import sys
import time

# Extra bytes to count for every '\n' written when tracking the output file
# position (see PyText2Pdf.writestr).  Zero because the PDF is opened in
# binary mode, so a newline is always exactly one byte on disk.
LF_EXTRA = 0
# Carriage return; appended to each cross-reference table entry.
LINE_END = '\015'
# form feed character (^L)
FF = chr(12)

# Font /Encoding dictionary written into the font object when ISO Latin-1
# encoding is requested.  The /Differences array names all 256 character
# codes starting at code 0.
ENCODING_STR = """\
/Encoding <<
/Differences [ 0 /.notdef /.notdef /.notdef /.notdef
/.notdef /.notdef /.notdef /.notdef /.notdef /.notdef
/.notdef /.notdef /.notdef /.notdef /.notdef /.notdef
/.notdef /.notdef /.notdef /.notdef /.notdef /.notdef
/.notdef /.notdef /.notdef /.notdef /.notdef /.notdef
/.notdef /.notdef /.notdef /.notdef /space /exclam
/quotedbl /numbersign /dollar /percent /ampersand
/quoteright /parenleft /parenright /asterisk /plus /comma
/hyphen /period /slash /zero /one /two /three /four /five
/six /seven /eight /nine /colon /semicolon /less /equal
/greater /question /at /A /B /C /D /E /F /G /H /I /J /K /L
/M /N /O /P /Q /R /S /T /U /V /W /X /Y /Z /bracketleft
/backslash /bracketright /asciicircum /underscore
/quoteleft /a /b /c /d /e /f /g /h /i /j /k /l /m /n /o /p
/q /r /s /t /u /v /w /x /y /z /braceleft /bar /braceright
/asciitilde /.notdef /.notdef /.notdef /.notdef /.notdef
/.notdef /.notdef /.notdef /.notdef /.notdef /.notdef
/.notdef /.notdef /.notdef /.notdef /.notdef /.notdef
/dotlessi /grave /acute /circumflex /tilde /macron /breve
/dotaccent /dieresis /.notdef /ring /cedilla /.notdef
/hungarumlaut /ogonek /caron /space /exclamdown /cent
/sterling /currency /yen /brokenbar /section /dieresis
/copyright /ordfeminine /guillemotleft /logicalnot /hyphen
/registered /macron /degree /plusminus /twosuperior
/threesuperior /acute /mu /paragraph /periodcentered
/cedilla /onesuperior /ordmasculine /guillemotright
/onequarter /onehalf /threequarters /questiondown /Agrave
/Aacute /Acircumflex /Atilde /Adieresis /Aring /AE
/Ccedilla /Egrave /Eacute /Ecircumflex /Edieresis /Igrave
/Iacute /Icircumflex /Idieresis /Eth /Ntilde /Ograve
/Oacute /Ocircumflex /Otilde /Odieresis /multiply /Oslash
/Ugrave /Uacute /Ucircumflex /Udieresis /Yacute /Thorn
/germandbls /agrave /aacute /acircumflex /atilde /adieresis
/aring /ae /ccedilla /egrave /eacute /ecircumflex
/edieresis /igrave /iacute /icircumflex /idieresis /eth
/ntilde /ograve /oacute /ocircumflex /otilde /odieresis
/divide /oslash /ugrave /uacute /ucircumflex /udieresis
/yacute /thorn /ydieresis ]
>>
"""

# Usage text handed to optparse; "%prog" is expanded by optparse itself.
INTRO = """\
%prog [options] filename

PyText2Pdf makes a 7-bit clean PDF file from any input file.

It reads from a named file, and writes the PDF file to a file specified by
the user, otherwise to a file with '.pdf' appended to the input file.

Author: Anand B Pillai."""


class PyText2Pdf(object):
    """Text2pdf converter in pure Python.

    Typical use is ``parse_args()`` followed by ``convert()``.  The
    converter reads the input one character at a time, writes each text
    line as a PDF literal string and records the byte offset of every PDF
    object so that the cross-reference table can be emitted at the end.

    The code runs unchanged on Python 2 and Python 3: the input is decoded
    as latin-1 (one character per byte) and all output passes through
    ``writestr``, which encodes text back to single bytes.
    """

    def __init__(self):
        # version number
        self._version = "1.3"
        # iso encoding flag
        self._IsoEnc = False
        # formfeeds flag: when true a form feed ends the line and the page
        self._doFFs = False
        self._progname = "PyText2Pdf"
        self._appname = " ".join((self._progname, str(self._version)))
        # default font
        self._font = "/Courier"
        # default font size
        self._ptSize = 10
        # default vert space
        self._vertSpace = 12
        # lines per page; 0 means "compute from page height" in convert()
        self._lines = 0
        # number of characters in a row
        self._cols = 80
        # number of text columns per page (1 or 2)
        self._columns = 1
        # page ht
        self._pageHt = 792
        # page wd
        self._pageWd = 612
        # input file
        self._ifile = ""
        # output file
        self._ofile = ""
        # default tab width
        self._tab = 4
        # input file object
        self._ifs = None
        # output file object
        self._ofs = None
        # landscape flag
        self._landscape = False
        # Subject
        self._subject = ''
        # Author
        self._author = ''
        # Keywords
        self._keywords = []
        # Custom regexp for page breaks
        self._pagebreakre = None

        # marker objects: objects 1-5 are fixed (info, catalog, page tree,
        # font, resources); page objects are numbered from 6 upwards.
        self._curobj = 5
        self._pageObs = [0]
        self._locations = [0, 0, 0, 0, 0, 0]
        self._pageNo = 0

        # file position marker (bytes written so far)
        self._fpos = 0
        # one-character look-ahead buffer used by _getc()/_ungetc()
        self._pushback = ''

    def parse_args(self):
        """Parse ``sys.argv`` and configure the converter.

        Exits the process (via ``sys.exit`` or optparse's own error
        handling) when the input file argument is missing, too many
        arguments are given, a numeric option is not an integer, or the
        page-break regular expression does not compile.
        """
        if len(sys.argv) < 2:
            sys.argv.append('-h')

        parser = optparse.OptionParser(usage=INTRO)
        add = parser.add_option
        add('-o', '--output', dest='outfile',
            help='Direct output to file OUTFILE', metavar='OUTFILE')
        add('-f', '--font', dest='font',
            help='Use Postscript font FONT (must be in standard 14, default: Courier)',
            default='Courier')
        add('-I', '--isolatin', dest='isolatin',
            help='Use ISO latin-1 encoding', default=False,
            action='store_true')
        # type='int' lets optparse report bad numbers instead of a traceback.
        add('-s', '--size', dest='fontsize', type='int',
            help='Use font at PTSIZE points (default=>10)',
            metavar='PTSIZE', default=10)
        add('-v', '--linespace', dest='linespace', type='int',
            help='Use line spacing LINESPACE (deault 12)',
            metavar='LINESPACE', default=12)
        # No default: an unspecified line count is derived from the page
        # height in convert(), so landscape/A4/large spacing fit the page.
        add('-l', '--lines', dest='lines', type='int',
            help='Lines per page (default 60, determined automatically if unspecified)',
            default=None)
        add('-c', '--chars', dest='chars', type='int',
            help='Maximum characters per line (default 80)', default=80)
        add('-t', '--tab', dest='tabspace', type='int',
            help='Spaces per tab character (default 4)', default=4)
        add('-F', '--ignoreff', dest='formfeed',
            help='Ignore formfeed character ^L (i.e, accept formfeed characters as pagebreaks)',
            default=False, action='store_true')
        add('-P', '--papersize', dest='papersize',
            help='Set paper size (default is letter, accepted values are "A4" or "A3")')
        # No defaults for width/height: a default here would always
        # overwrite the dimensions selected through --papersize.
        add('-W', '--width', dest='width', type='int',
            help='Independent paper width in points', default=None)
        add('-H', '--height', dest='height', type='int',
            help='Independent paper height in points', default=None)
        add('-2', '--twocolumns', dest='twocolumns',
            help='Format as two columns', default=False, action='store_true')
        add('-L', '--landscape', dest='landscape',
            help='Format in landscape mode', default=False,
            action='store_true')
        add('-R', '--regexp', dest='pageregexp',
            help='Regular expression string to determine page breaks (if supplied, this will be used to split text into pages, instead of using line count)')
        add('-S', '--subject', dest='subject',
            help='Optional subject for the document')
        add('-A', '--author', dest='author',
            help='Optional author for the document')
        add('-K', '--keywords', dest='keywords',
            help='Optional list of keywords for the document (separated by commas)')

        opts, args = parser.parse_args()

        if not args:
            sys.exit('Error: input file argument missing')
        if len(args) > 1:
            sys.exit('Error: Too many arguments')

        self._ifile = args[0]

        if opts.isolatin:
            self._IsoEnc = True
        if opts.formfeed:
            self._doFFs = True
        if opts.twocolumns:
            self._columns = 2
        if opts.landscape:
            self._landscape = True

        self._font = '/' + opts.font

        if opts.papersize == 'A4':
            self._pageWd, self._pageHt = 595, 842
        elif opts.papersize == 'A3':
            self._pageWd, self._pageHt = 842, 1190

        self._ptSize = max(1, opts.fontsize)
        self._vertSpace = max(1, opts.linespace)
        # 0 asks convert() to compute the count from the final page height.
        self._lines = 0 if opts.lines is None else max(1, opts.lines)
        self._cols = max(4, opts.chars)
        self._tab = max(1, opts.tabspace)

        # Explicit dimensions win over --papersize, but only when given.
        if opts.width is not None:
            self._pageWd = max(72, opts.width)
        if opts.height is not None:
            self._pageHt = max(72, opts.height)

        # Very optional args
        if opts.author:
            self._author = opts.author
        if opts.subject:
            self._subject = opts.subject
        if opts.keywords:
            self._keywords = opts.keywords.split(',')

        if opts.pageregexp:
            try:
                self._pagebreakre = re.compile(opts.pageregexp,
                                               re.UNICODE | re.IGNORECASE)
            except re.error as err:
                parser.error('Invalid page break regular expression: %s' % err)

        if opts.outfile:
            self._ofile = opts.outfile

        if self._landscape:
            print('Landscape option on...')
        if self._columns == 2:
            print('Printing in two columns...')
        if self._doFFs:
            print('Ignoring form feed character...')
        if self._IsoEnc:
            print('Using ISO Latin Encoding...')

        print('Using font %s size = %s' % (self._font[1:], self._ptSize))

    def writestr(self, str):
        """Write a string to the output file and track the file position.

        All output operations go through this function.  Text is encoded
        as latin-1 (one byte per character, unencodable characters become
        '?') so that ``self._fpos`` always counts bytes on disk.

        The parameter keeps its historical name ``str`` for compatibility.

        Returns 0 on success, or -1 after printing the error when the
        write raises IOError.
        """
        data = str
        if not isinstance(data, bytes):
            data = data.encode('latin-1', 'replace')
        try:
            self._ofs.write(data)
        except IOError as err:
            print(err)
            return -1

        # update current file position
        self._fpos += len(data) + data.count(b'\n') * LF_EXTRA
        return 0

    def convert(self):
        """Perform the actual conversion.

        Opens the input and output files, writes the complete PDF and
        closes both files again (also when writing fails).  Exits with
        status 3 when either file cannot be opened.  Returns 0 on success.
        """
        if self._landscape:
            # swap page width & height
            self._pageHt, self._pageWd = self._pageWd, self._pageHt

        if self._lines == 0:
            self._lines = max(1, (self._pageHt - 72) // self._vertSpace)

        try:
            # latin-1 maps every byte to exactly one character, so any
            # input can be read and each character fits a PDF octal escape.
            self._ifs = io.open(self._ifile, 'r', encoding='latin-1')
        except IOError:
            print('Error: Could not open file to read ---> %s' % self._ifile)
            sys.exit(3)
        self._pushback = ''

        if self._ofile == "":
            self._ofile = os.path.splitext(self._ifile)[0] + '.pdf'

        try:
            self._ofs = open(self._ofile, 'wb')
        except IOError:
            self._ifs.close()
            print('Error: Could not open file to write ---> %s' % self._ofile)
            sys.exit(3)

        try:
            print('Input file=> %s' % self._ifile)
            print('Writing pdf file %s ...' % self._ofile)
            self.writeheader()
            self.writepages()
            self.writerest()
            print('Wrote file %s' % self._ofile)
        finally:
            self._ifs.close()
            self._ofs.close()
        return 0

    @staticmethod
    def _pdf_escape(text):
        """Return *text* made safe for use inside a PDF literal string.

        Parentheses and backslashes are backslash-escaped, characters
        outside printable ASCII are written as three-digit octal escapes,
        and characters above code 255 (not representable in one byte) are
        replaced by '?'.
        """
        pieces = []
        for ch in text:
            code = ord(ch)
            if ch in '()\\':
                pieces.append('\\' + ch)
            elif 32 <= code <= 126:
                pieces.append(ch)
            elif code <= 255:
                pieces.append('\\%03o' % code)
            else:
                pieces.append('?')
        return ''.join(pieces)

    def writeheader(self):
        """Write the PDF header and the fixed objects 1, 2, 4 and 5.

        Object 1 is the document information dictionary, 2 the catalog,
        4 the font and 5 the shared page resources.  Object 3 (the page
        tree) is written later by ``writerest`` once the pages are known.
        """
        ws = self.writestr
        esc = self._pdf_escape

        title = self._ifile
        timestr = time.strftime("D:%Y%m%d%H%M%S", time.localtime())

        ws("%PDF-1.4\n")
        self._locations[1] = self._fpos
        ws("1 0 obj\n")
        ws("<<\n")
        ws("/Creator (%s By Anand B Pillai )\n" % self._appname)
        ws("/CreationDate (%s)\n" % timestr)
        ws("/Producer (%s(\\251 Anand B Pillai))\n" % self._appname)
        if self._subject:
            # a subject, when given, doubles as the document title
            title = self._subject
            ws("/Subject (%s)\n" % esc(self._subject))
        if self._author:
            ws("/Author (%s)\n" % esc(self._author))
        if self._keywords:
            ws("/Keywords (%s)\n" % esc(' '.join(self._keywords)))
        if title:
            ws("/Title (%s)\n" % esc(title))
        ws(">>\n")
        ws("endobj\n")

        self._locations[2] = self._fpos
        ws("2 0 obj\n")
        ws("<<\n")
        ws("/Type /Catalog\n")
        ws("/Pages 3 0 R\n")
        ws(">>\n")
        ws("endobj\n")

        self._locations[4] = self._fpos
        ws("4 0 obj\n")
        ws("<<\n")
        ws("/Type /Font\n")
        ws("/Subtype /Type1\n")
        ws("/Name /F1\n")
        ws("/BaseFont %s\n" % self._font)
        # Exactly one /Encoding entry, written inside the dictionary.
        if self._IsoEnc:
            ws(ENCODING_STR)
        else:
            ws("/Encoding /WinAnsiEncoding\n")
        ws(">>\n")
        ws("endobj\n")

        self._locations[5] = self._fpos
        ws("5 0 obj\n")
        ws("<<\n")
        ws(" /Font << /F1 4 0 R >>\n")
        ws(" /ProcSet [ /PDF /Text ]\n")
        ws(">>\n")
        ws("endobj\n")

    def startpage(self):
        """Start a page of data.

        Writes the page object and opens its content stream, setting the
        font, the initial text position and the line spacing.

        Returns the file position at which the stream data begins; pass
        it to ``endpage`` so the stream length can be computed.
        """
        ws = self.writestr

        self._pageNo += 1
        self._curobj += 1

        # _locations[n] is the offset of object n; objects are created in
        # increasing order, so appending keeps index == object number.
        self._locations.append(self._fpos)
        self._pageObs.append(self._curobj)

        ws("%d 0 obj\n" % self._curobj)
        ws("<<\n")
        ws("/Type /Page\n")
        ws("/Parent 3 0 R\n")
        ws("/Resources 5 0 R\n")

        self._curobj += 1
        ws("/Contents %d 0 R\n" % self._curobj)
        ws(">>\n")
        ws("endobj\n")

        self._locations.append(self._fpos)

        ws("%d 0 obj\n" % self._curobj)
        ws("<<\n")
        # the length lives in the next object, written by endpage()
        ws("/Length %d 0 R\n" % (self._curobj + 1))
        ws(">>\n")
        ws("stream\n")
        stream_start = self._fpos

        ws("BT\n")
        ws("/F1 %s Tf\n" % self._ptSize)
        ws("1 0 0 1 50 %s Tm\n" % (self._pageHt - 40))
        ws("%s TL\n" % self._vertSpace)

        return stream_start

    def endpage(self, streamStart):
        """End a page of data.

        Closes the content stream opened by ``startpage`` and writes the
        object holding the stream length.  *streamStart* is the value
        returned by ``startpage``.
        """
        ws = self.writestr

        ws("ET\n")
        stream_end = self._fpos
        ws("endstream\n")
        ws("endobj\n")

        self._curobj += 1
        self._locations.append(self._fpos)

        ws("%d 0 obj\n" % self._curobj)
        ws("%d\n" % (stream_end - streamStart))
        ws('endobj\n')

    def _getc(self):
        """Return the next input character, or '' at end of input."""
        if self._pushback:
            ch, self._pushback = self._pushback, ''
            return ch
        return self._ifs.read(1)

    def _ungetc(self, ch):
        """Push one character back so that the next _getc() returns it."""
        self._pushback = ch

    def _write_line(self):
        """Read one input line and write it as a PDF string shown with '.

        Reads at most ``self._cols`` characters.  Returns a tuple
        ``(last_char, pagebreak)``: the last character read ('' at end of
        input, None if nothing was read) and whether the finished line
        matched the page-break regular expression.
        """
        out = ["("]
        linebuf = []
        pagebreak = False
        ch = None
        char_no = 0

        while char_no < self._cols:
            char_no += 1
            ch = self._getc()
            if ch == '' or ch == '\n' or (ch == FF and self._doFFs):
                # See if this line matches the pagebreak regexp
                if self._pagebreakre and \
                        self._pagebreakre.search(''.join(linebuf).strip()):
                    pagebreak = True
                break
            linebuf.append(ch)

            code = ord(ch)
            if 32 <= code <= 127:
                if ch in '()\\':
                    out.append("\\")
                out.append(ch)
            elif code == 9:
                # expand the tab to the next tab stop
                padding = self._tab - ((char_no - 1) % self._tab)
                out.append(" " * padding)
                char_no += padding - 1
            elif ch == FF:
                # dont print anything for a FF
                char_no -= 1
            else:
                # write \ooo (octal) form for dodgy character
                out.append('\\%03o' % code)

        out.append(")'\n")
        self.writestr(''.join(out))
        return ch, pagebreak

    def _write_column(self, at_eof):
        """Write up to ``self._lines`` lines into the current column.

        Stops early on a form feed, a regexp page break or end of input.
        Takes and returns the end-of-input flag.
        """
        at_ff = False
        pagebreak = False
        line_no = 0

        while line_no < self._lines and not (at_ff or at_eof or pagebreak):
            line_no += 1
            ch, pagebreak = self._write_line()

            if ch == '':
                # input exhausted: do not pad the page with empty lines
                at_eof = True
            if ch == FF:
                at_ff = True
            at_bop = (line_no == self._lines)

            if not at_eof and (at_bop or at_ff):
                # Look ahead so a page is not started for nothing.
                nxt = self._getc()
                if at_bop and nxt == FF:
                    # a form feed right after a full page adds no page
                    nxt = self._getc()
                if nxt == '':
                    at_eof = True
                else:
                    self._ungetc(nxt)

        return at_eof

    def writepages(self):
        """Write pages as PDF.

        Emits one page object per page of input until the input is
        exhausted; at least one page is always written.  In two-column
        mode the text position is moved to the right half of the page
        between the first and the second column.
        """
        ws = self.writestr
        at_eof = False

        while not at_eof:
            stream_start = self.startpage()

            for column in range(1, self._columns + 1):
                at_eof = self._write_column(at_eof)

                if column < self._columns:
                    # another column follows: move to the top of it
                    ws("1 0 0 1 %s %s Tm\n" % (self._pageWd // 2 + 25,
                                               self._pageHt - 40))

            self.endpage(stream_start)

    def writerest(self):
        """Finish the file.

        Writes the page tree (object 3), the cross-reference table, the
        trailer and the end-of-file marker.
        """
        ws = self.writestr
        self._locations[3] = self._fpos

        ws("3 0 obj\n")
        ws("<<\n")
        ws("/Type /Pages\n")
        ws("/Count %d\n" % self._pageNo)
        ws("/MediaBox [ 0 0 %s %s ]\n" % (self._pageWd, self._pageHt))
        ws("/Kids [ ")
        for page_obj in self._pageObs[1:self._pageNo + 1]:
            ws("%d 0 R " % page_obj)
        ws("]\n")
        ws(">>\n")
        ws("endobj\n")

        xref = self._fpos
        ws("xref\n")
        ws("0 %d\n" % (self._curobj + 1))
        # every entry is 19 characters plus LINE_END, 20 bytes in total
        ws("0000000000 65535 f " + LINE_END)
        for offset in self._locations[1:self._curobj + 1]:
            ws("%010d 00000 n %s" % (offset, LINE_END))

        ws("trailer\n")
        ws("<<\n")
        ws("/Size %d\n" % (self._curobj + 1))
        ws("/Root 2 0 R\n")
        ws("/Info 1 0 R\n")
        ws(">>\n")

        ws("startxref\n")
        ws("%d\n" % xref)
        ws("%%EOF\n")


def main():
    """Command-line entry point.

    Builds a converter, configures it from ``sys.argv`` and runs the
    conversion.  Returns the value returned by ``PyText2Pdf.convert()``
    (0 on success).
    """
    converter = PyText2Pdf()
    converter.parse_args()
    return converter.convert()


# Run the converter only when executed as a script, not when imported.
if __name__ == "__main__":
    main()