# Source: GitHub repository 1N3/Sn1per, path blob/master/bin/pyText2pdf.py
# (copy retrieved through the CoCalc file viewer).
1
#! /usr/bin/env python
2
"""
 pyText2Pdf - Python script to convert plain text files into Adobe
 Acrobat PDF files with support for arbitrary page breaks etc.

 Version 2.0

 Author: Anand B Pillai <abpillai at gmail dot com>

"""
11

12
# Derived from http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/189858
13

14
import sys, os
15
import string
16
import time
17
import optparse
18
import re
19

20
# Amount added to the tracked output position for every "\n" written.
LF_EXTRA = 0
# Terminator appended to each cross-reference table entry (carriage return).
LINE_END = '\015'
# Form feed character (^L).
FF = chr(12)
24

25
# "/Encoding" entry written into the font object when ISO Latin-1 encoding
# is requested: a /Differences array that names the glyph for every
# character code starting at 0.
ENCODING_STR = """\
/Encoding <<
/Differences [ 0 /.notdef /.notdef /.notdef /.notdef
/.notdef /.notdef /.notdef /.notdef /.notdef /.notdef
/.notdef /.notdef /.notdef /.notdef /.notdef /.notdef
/.notdef /.notdef /.notdef /.notdef /.notdef /.notdef
/.notdef /.notdef /.notdef /.notdef /.notdef /.notdef
/.notdef /.notdef /.notdef /.notdef /space /exclam
/quotedbl /numbersign /dollar /percent /ampersand
/quoteright /parenleft /parenright /asterisk /plus /comma
/hyphen /period /slash /zero /one /two /three /four /five
/six /seven /eight /nine /colon /semicolon /less /equal
/greater /question /at /A /B /C /D /E /F /G /H /I /J /K /L
/M /N /O /P /Q /R /S /T /U /V /W /X /Y /Z /bracketleft
/backslash /bracketright /asciicircum /underscore
/quoteleft /a /b /c /d /e /f /g /h /i /j /k /l /m /n /o /p
/q /r /s /t /u /v /w /x /y /z /braceleft /bar /braceright
/asciitilde /.notdef /.notdef /.notdef /.notdef /.notdef
/.notdef /.notdef /.notdef /.notdef /.notdef /.notdef
/.notdef /.notdef /.notdef /.notdef /.notdef /.notdef
/dotlessi /grave /acute /circumflex /tilde /macron /breve
/dotaccent /dieresis /.notdef /ring /cedilla /.notdef
/hungarumlaut /ogonek /caron /space /exclamdown /cent
/sterling /currency /yen /brokenbar /section /dieresis
/copyright /ordfeminine /guillemotleft /logicalnot /hyphen
/registered /macron /degree /plusminus /twosuperior
/threesuperior /acute /mu /paragraph /periodcentered
/cedilla /onesuperior /ordmasculine /guillemotright
/onequarter /onehalf /threequarters /questiondown /Agrave
/Aacute /Acircumflex /Atilde /Adieresis /Aring /AE
/Ccedilla /Egrave /Eacute /Ecircumflex /Edieresis /Igrave
/Iacute /Icircumflex /Idieresis /Eth /Ntilde /Ograve
/Oacute /Ocircumflex /Otilde /Odieresis /multiply /Oslash
/Ugrave /Uacute /Ucircumflex /Udieresis /Yacute /Thorn
/germandbls /agrave /aacute /acircumflex /atilde /adieresis
/aring /ae /ccedilla /egrave /eacute /ecircumflex
/edieresis /igrave /iacute /icircumflex /idieresis /eth
/ntilde /ograve /oacute /ocircumflex /otilde /odieresis
/divide /oslash /ugrave /uacute /ucircumflex /udieresis
/yacute /thorn /ydieresis ]
>>
"""
67

68
# Usage text handed to optparse; "%prog" is expanded to the program name.
INTRO="""\
%prog [options] filename

PyText2Pdf  makes a 7-bit clean PDF file from any input file.

It reads from a named file, and writes the PDF file to a file specified by
the user, otherwise to a file with '.pdf' appended to the input file.

Author: Anand B Pillai."""
77

78

79
class PyText2Pdf(object):
    """Text to PDF converter in pure Python.

    Typical use is ``parse_args()`` (configure from ``sys.argv``) followed
    by ``convert()``.  The PDF is produced sequentially: every byte goes
    through ``writestr`` so that the byte offset of each object can be
    recorded for the cross-reference table written at the end.

    The code is written to be valid on both Python 2.6+ and Python 3.
    """

    def __init__(self):
        """Initialise the converter with its default settings."""
        # version number
        self._version = "1.3"
        # iso encoding flag
        self._IsoEnc = False
        # formfeeds flag (when set, ^L ends the current line and the page)
        self._doFFs = False
        self._progname = "PyText2Pdf"
        self._appname = " ".join((self._progname, str(self._version)))
        # default font
        self._font = "/Courier"
        # default font size
        self._ptSize = 10
        # default vert space
        self._vertSpace = 12
        # lines per page; 0 means "derive from the page height" in convert()
        self._lines = 0
        # number of characters in a row
        self._cols = 80
        # number of text columns per page
        self._columns = 1
        # page ht
        self._pageHt = 792
        # page wd
        self._pageWd = 612
        # input file
        self._ifile = ""
        # output file
        self._ofile = ""
        # default tab width
        self._tab = 4
        # input file descriptor
        self._ifs = None
        # output file descriptor
        self._ofs = None
        # landscape flag
        self._landscape = False
        # Subject
        self._subject = ''
        # Author
        self._author = ''
        # Keywords
        self._keywords = []
        # Custom regexp  for page breaks
        self._pagebreakre = None

        # marker objects: objects 1-5 are fixed (info, catalog, page tree,
        # font, resources); page and content objects are numbered from 6.
        self._curobj = 5
        # object number of every page, indexed by page number (slot 0 unused)
        self._pageObs = [0]
        # byte offset of every object, indexed by object number
        self._locations = [0, 0, 0, 0, 0, 0]
        self._pageNo = 0

        # file position marker
        self._fpos = 0

    def parse_args(self):
        """Configure the converter from the command line (``sys.argv``).

        Prints the help text and exits when called without any argument;
        exits with an error message unless exactly one input file is named.
        """
        if len(sys.argv) < 2:
            sys.argv.append('-h')

        optlist, args = self._build_parser().parse_args()
        self._apply_options(optlist, args)

    def _build_parser(self):
        """Return the optparse parser describing all supported options."""
        parser = optparse.OptionParser(usage=INTRO)
        add = parser.add_option
        add('-o', '--output', dest='outfile', metavar='OUTFILE',
            help='Direct output to file OUTFILE')
        add('-f', '--font', dest='font', default='Courier',
            help='Use Postscript font FONT (must be in standard 14, default: Courier)')
        add('-I', '--isolatin', dest='isolatin', default=False, action='store_true',
            help='Use ISO latin-1 encoding')
        add('-s', '--size', dest='fontsize', metavar='PTSIZE', type='int', default=10,
            help='Use font at PTSIZE points (default=>10)')
        add('-v', '--linespace', dest='linespace', metavar='LINESPACE', type='int', default=12,
            help='Use line spacing LINESPACE (deault 12)')
        # No default: an unspecified value leaves the line count to be
        # derived from the page height in convert().
        add('-l', '--lines', dest='lines', type='int', default=None,
            help='Lines per page (default 60, determined automatically if unspecified)')
        add('-c', '--chars', dest='chars', type='int', default=80,
            help='Maximum characters per line (default 80)')
        add('-t', '--tab', dest='tabspace', type='int', default=4,
            help='Spaces per tab character (default 4)')
        add('-F', '--ignoreff', dest='formfeed', default=False, action='store_true',
            help='Ignore formfeed character ^L (i.e, accept formfeed characters as pagebreaks)')
        add('-P', '--papersize', dest='papersize',
            help='Set paper size (default is letter, accepted values are "A4" or "A3")')
        # No defaults for width/height: a default here would silently
        # override the size selected with --papersize.
        add('-W', '--width', dest='width', type='int', default=None,
            help='Independent paper width in points')
        add('-H', '--height', dest='height', type='int', default=None,
            help='Independent paper height in points')
        add('-2', '--twocolumns', dest='twocolumns', default=False, action='store_true',
            help='Format as two columns')
        add('-L', '--landscape', dest='landscape', default=False, action='store_true',
            help='Format in landscape mode')
        add('-R', '--regexp', dest='pageregexp',
            help='Regular expression string to determine page breaks (if supplied, this will be used to split text into pages, instead of using line count)')
        add('-S', '--subject', dest='subject',
            help='Optional subject for the document')
        add('-A', '--author', dest='author',
            help='Optional author for the document')
        add('-K', '--keywords', dest='keywords',
            help='Optional list of keywords for the document (separated by commas)')
        return parser

    def _apply_options(self, optlist, args):
        """Copy parsed options into the converter's settings.

        optlist -- options object returned by the parser
        args    -- positional arguments; must hold exactly the input file

        Options that are absent (None) keep the current setting.  Exits
        via ``sys.exit`` when the number of positional arguments is wrong.
        """
        if len(args) == 0:
            sys.exit('Error: input file argument missing')
        elif len(args) > 1:
            sys.exit('Error: Too many arguments')

        self._ifile = args[0]

        d = vars(optlist)

        def clamped(name, minimum, current):
            # Value of option `name`, raised to `minimum`; `current` if unset.
            value = d.get(name)
            if value is None:
                return current
            return max(int(value), minimum)

        if d.get('isolatin'): self._IsoEnc = True
        if d.get('formfeed'): self._doFFs = True
        if d.get('twocolumns'): self._columns = 2
        if d.get('landscape'): self._landscape = True

        font = d.get('font')
        if font:
            self._font = '/' + font

        psize = d.get('papersize')
        if psize == 'A4':
            self._pageWd = 595
            self._pageHt = 842
        elif psize == 'A3':
            self._pageWd = 842
            self._pageHt = 1190

        self._ptSize = clamped('fontsize', 1, self._ptSize)
        self._vertSpace = clamped('linespace', 1, self._vertSpace)
        self._lines = clamped('lines', 1, self._lines)
        self._cols = clamped('chars', 4, self._cols)
        self._tab = clamped('tabspace', 1, self._tab)
        # Explicit width/height win over --papersize, but only when given.
        self._pageWd = clamped('width', 72, self._pageWd)
        self._pageHt = clamped('height', 72, self._pageHt)

        # Very optional args
        author = d.get('author')
        if author: self._author = author

        subject = d.get('subject')
        if subject: self._subject = subject

        keywords = d.get('keywords')
        if keywords:
            self._keywords = keywords.split(',')

        pagebreak = d.get('pageregexp')
        if pagebreak:
            self._pagebreakre = re.compile(pagebreak, re.UNICODE | re.IGNORECASE)

        outfile = d.get('outfile')
        if outfile: self._ofile = outfile

        if self._landscape:
            print('Landscape option on...')
        if self._columns == 2:
            print('Printing in two columns...')
        if self._doFFs:
            print('Ignoring form feed character...')
        if self._IsoEnc:
            print('Using ISO Latin Encoding...')

        print('Using font %s size = %s' % (self._font[1:], self._ptSize))

    @staticmethod
    def _pdf_escape(text):
        """Return `text` with backslashes and parentheses backslash-escaped.

        Needed for any value placed inside a PDF literal string ``( ... )``.
        """
        return text.replace('\\', '\\\\').replace('(', '\\(').replace(')', '\\)')

    def writestr(self, str):
        """Write a string to the output file and track the file position.

        All output operations go through this function so that
        ``self._fpos`` always holds the offset of the next byte.

        Returns 0 on success, -1 when the write raised IOError (the error
        is printed).  The parameter keeps its original name ``str`` for
        backward compatibility even though it shadows the builtin.
        """
        data = str
        if not isinstance(data, bytes):
            # The output file is binary; characters outside Latin-1 are
            # replaced one-for-one so the length stays predictable.
            data = data.encode('latin-1', 'replace')

        # update current file position
        self._fpos += len(data) + LF_EXTRA * data.count(b'\n')
        try:
            self._ofs.write(data)
        except IOError as e:
            print(e)
            return -1

        return 0

    def convert(self):
        """Perform the actual conversion.

        Opens the input and output files, writes the complete PDF and
        closes both files again.  Exits with status 3 when either file
        cannot be opened.  Returns 0.
        """
        if self._landscape:
            # swap page width & height
            self._pageHt, self._pageWd = self._pageWd, self._pageHt

        if self._lines == 0:
            # 72 points are reserved for the top and bottom margins
            self._lines = (self._pageHt - 72) // self._vertSpace
        if self._lines < 1:
            self._lines = 1

        try:
            # Binary mode: bytes are processed one by one and the
            # one-character push-back relies on exact byte offsets.
            self._ifs = open(self._ifile, 'rb')
        except IOError:
            print('Error: Could not open file to read ---> %s' % (self._ifile,))
            sys.exit(3)

        try:
            if self._ofile == "":
                self._ofile = os.path.splitext(self._ifile)[0] + '.pdf'
                if os.path.abspath(self._ofile) == os.path.abspath(self._ifile):
                    # The input already ends in ".pdf"; never truncate it.
                    self._ofile = self._ifile + '.pdf'

            try:
                self._ofs = open(self._ofile, 'wb')
            except IOError:
                print('Error: Could not open file to write ---> %s' % (self._ofile,))
                sys.exit(3)

            try:
                print('Input file=> %s' % (self._ifile,))
                print('Writing pdf file %s ...' % (self._ofile,))
                self.writeheader()
                self.writepages()
                self.writerest()
                print('Wrote file %s' % (self._ofile,))
            finally:
                self._ofs.close()
        finally:
            self._ifs.close()

        return 0

    def writeheader(self):
        """Write the PDF header and the fixed objects 1, 2, 4 and 5.

        Object 1 is the document information, 2 the catalog, 4 the font
        and 5 the resource dictionary.  Object 3 (the page tree) is
        written by ``writerest`` once the number of pages is known.
        """
        ws = self.writestr
        esc = self._pdf_escape

        title = self._ifile
        timestr = str(time.strftime("D:%Y%m%d%H%M%S", time.localtime()))

        ws("%PDF-1.4\n")
        self._locations[1] = self._fpos
        ws("1 0 obj\n")
        ws("<<\n")
        ws("".join(("/Creator (", self._appname, " By Anand B Pillai )\n")))
        ws("".join(("/CreationDate (", timestr, ")\n")))
        ws("".join(("/Producer (", self._appname, "(\\251 Anand B Pillai))\n")))
        if self._subject:
            # a subject, when given, also serves as the title
            title = self._subject
            ws("".join(("/Subject (", esc(self._subject), ")\n")))
        if self._author:
            ws("".join(("/Author (", esc(self._author), ")\n")))
        if self._keywords:
            ws("".join(("/Keywords (", esc(' '.join(self._keywords)), ")\n")))
        if title:
            ws("".join(("/Title (", esc(title), ")\n")))
        ws(">>\n")
        ws("endobj\n")

        self._locations[2] = self._fpos
        ws("2 0 obj\n")
        ws("<<\n")
        ws("/Type /Catalog\n")
        ws("/Pages 3 0 R\n")
        ws(">>\n")
        ws("endobj\n")

        self._locations[4] = self._fpos
        ws("4 0 obj\n")
        ws("<<\n")
        ws("".join(("/BaseFont ", str(self._font),
                    " /Name /F1 /Subtype /Type1 /Type /Font\n")))
        # Exactly one /Encoding entry, written inside the dictionary.
        if self._IsoEnc:
            ws(ENCODING_STR)
        else:
            ws("/Encoding /WinAnsiEncoding\n")
        ws(">>\n")
        ws("endobj\n")

        self._locations[5] = self._fpos
        ws("5 0 obj\n")
        ws("<<\n")
        ws("  /Font << /F1 4 0 R >>\n")
        ws("  /ProcSet [ /PDF /Text ]\n")
        ws(">>\n")
        ws("endobj\n")

    def startpage(self):
        """Start a page of data.

        Writes the page object and opens its content stream (font, start
        position and line spacing).  Returns the file position at which
        the stream data begins; pass it to ``endpage``.
        """
        ws = self.writestr

        self._pageNo += 1

        # Page object.  Object numbers are handed out sequentially, so
        # appending keeps _locations indexed by object number and
        # _pageObs indexed by page number.
        self._curobj += 1
        self._locations.append(self._fpos)
        self._pageObs.append(self._curobj)

        ws("".join((str(self._curobj), " 0 obj\n")))
        ws("<<\n")
        ws("/Type /Page\n")
        ws("/Parent 3 0 R\n")
        ws("/Resources 5 0 R\n")

        # Content stream object
        self._curobj += 1
        ws("".join(("/Contents ", str(self._curobj), " 0 R\n")))
        ws(">>\n")
        ws("endobj\n")

        self._locations.append(self._fpos)

        ws("".join((str(self._curobj), " 0 obj\n")))
        ws("<<\n")
        # the stream length lives in the next object, written by endpage()
        ws("".join(("/Length ", str(self._curobj + 1), " 0 R\n")))
        ws(">>\n")
        ws("stream\n")
        strmPos = self._fpos

        ws("BT\n")
        ws("".join(("/F1 ", str(self._ptSize), " Tf\n")))
        ws("".join(("1 0 0 1 50 ", str(self._pageHt - 40), " Tm\n")))
        ws("".join((str(self._vertSpace), " TL\n")))

        return strmPos

    def endpage(self, streamStart):
        """End a page of data.

        Closes the content stream opened by ``startpage`` and writes the
        object holding its length.  `streamStart` is the position
        returned by ``startpage``.
        """
        ws = self.writestr

        ws("ET\n")
        streamEnd = self._fpos
        ws("endstream\n")
        ws("endobj\n")

        # Length object referenced by the stream dictionary
        self._curobj += 1
        self._locations.append(self._fpos)

        ws("".join((str(self._curobj), " 0 obj\n")))
        ws("".join((str(streamEnd - streamStart), '\n')))
        ws('endobj\n')

    def _readchar(self):
        """Read one character from the input; '' signals end of file."""
        raw = self._ifs.read(1)
        if isinstance(raw, str):
            return raw
        # bytes on Python 3: map each byte to the character of equal code
        return raw.decode('latin-1')

    def _unreadchar(self):
        """Push the character just read back onto the input."""
        self._ifs.seek(self._ifs.tell() - 1)

    def _input_exhausted(self):
        """Return True when no input is left, without consuming any."""
        if self._readchar() == '':
            return True
        self._unreadchar()
        return False

    def writepages(self):
        """Write the input text as PDF pages.

        Every input line becomes one ``(text)'`` operator.  Lines longer
        than ``self._cols`` are wrapped.  A page (or column) ends after
        ``self._lines`` lines, at a form feed when ``self._doFFs`` is
        set, or after a line matching ``self._pagebreakre``.
        """
        ws = self.writestr
        atEOF = False

        while not atEOF:
            beginstream = self.startpage()

            for column in range(1, self._columns + 1):
                atFF = False
                # Special flag for regexp page break
                pagebreak = False
                lineNo = 0

                while lineNo < self._lines and not (atFF or atEOF or pagebreak):
                    lineNo += 1
                    linebuf = ''
                    charNo = 0
                    ch = ''
                    ws("(")

                    while charNo < self._cols:
                        charNo += 1
                        ch = self._readchar()

                        if ch == '' or ch == '\n' or (ch == FF and self._doFFs):
                            # End of line: see if it matches the pagebreak regexp
                            if self._pagebreakre and self._pagebreakre.search(linebuf.strip()):
                                pagebreak = True
                            break

                        linebuf += ch
                        code = ord(ch)

                        if 32 <= code <= 127:
                            if ch in '()\\':
                                ws("\\")
                            ws(ch)
                        elif code == 9:
                            # expand the tab up to the next tab stop
                            padding = self._tab - ((charNo - 1) % self._tab)
                            ws(" " * padding)
                            charNo += padding - 1
                        elif ch == FF or ch == '\r':
                            # Print nothing for a FF.  CR is dropped too:
                            # the input is read in binary mode, so the CR
                            # of a CRLF line ending shows up here.
                            charNo -= 1
                        else:
                            # write \ooo (octal) form for dodgy character
                            ws("\\%03o" % code)

                    ws(")'\n")

                    if ch == '':
                        atEOF = True
                    if ch == FF:
                        atFF = True

                    if lineNo == self._lines:
                        # Bottom of page: swallow a form feed that follows
                        # immediately (the page ends here anyway), then
                        # check whether any input is left.
                        ch = self._readchar()
                        if ch == FF:
                            ch = self._readchar()
                        # python's EOF signature
                        if ch == '':
                            atEOF = True
                        else:
                            self._unreadchar()
                    elif (atFF or pagebreak) and not atEOF:
                        # Avoid starting an empty page after the last break
                        atEOF = self._input_exhausted()

                if column < self._columns:
                    # move to the top of the next column
                    ws("".join(("1 0 0 1 ",
                                str(self._pageWd // 2 + 25),
                                " ",
                                str(self._pageHt - 40),
                                " Tm\n")))

            self.endpage(beginstream)

    def writerest(self):
        """Finish the file.

        Writes the page tree (object 3), the cross-reference table and
        the trailer.
        """
        ws = self.writestr
        self._locations[3] = self._fpos

        ws("3 0 obj\n")
        ws("<<\n")
        ws("/Type /Pages\n")
        ws("".join(("/Count ", str(self._pageNo), "\n")))
        ws("".join(("/MediaBox [ 0 0 ", str(self._pageWd), " ", str(self._pageHt), " ]\n")))
        ws("/Kids [ ")
        for i in range(1, self._pageNo + 1):
            ws("".join((str(self._pageObs[i]), " 0 R ")))
        ws("]\n")
        ws(">>\n")
        ws("endobj\n")

        xref = self._fpos
        ws("xref\n")
        ws("".join(("0 ", str(self._curobj + 1), "\n")))
        ws("".join(("0000000000 65535 f ", str(LINE_END))))

        for i in range(1, self._curobj + 1):
            # each entry: 10-digit offset, generation number, "n"
            offset = str(self._locations[i]).zfill(10)
            ws("".join((offset, " 00000 n ", str(LINE_END))))

        ws("trailer\n")
        ws("<<\n")
        ws("".join(("/Size ", str(self._curobj + 1), "\n")))
        ws("/Root 2 0 R\n")
        ws("/Info 1 0 R\n")
        ws(">>\n")

        ws("startxref\n")
        ws("".join((str(xref), "\n")))
        ws("%%EOF\n")
592
        
593

594
def main():
    """Command-line entry point: read the arguments, then convert."""
    converter = PyText2Pdf()
    converter.parse_args()
    converter.convert()


# Run the conversion only when executed as a script.
if __name__ == "__main__":
    main()
602

603
Product

Resources

Company