Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
# ##### BEGIN GPL LICENSE BLOCK #####
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#
# ##### END GPL LICENSE BLOCK #####
# <pep8 compliant>
"""Functions for dealing with PDF files.
"""
__author__ = "howard.trickey@gmail.com"
import re
import sys
try:
import zlib
except ImportError:
zlib = None
# Python 2 and 3 differ in the type of result of indexing into
# a 'bytes' string, so define a function that returns the
# integer value regardless of the python version.
if sys.version_info[0] == 3:
def ordat(b, i):
return b[i]
else:
def ordat(b, i):
return ord(b[i])
WARN = True # print Warnings about strange things?
# PDF objects
OBOOL = 0
ONUM = 1
OSTRING = 2
ONAME = 3
OARRAY = 4
ODICT = 5
OSTREAM = 6
ONULL = 7
OINDIRECTDEF = 8
OINDIRECTREF = 9
_re_psbool = re.compile(br'true|false')
_re_psint = re.compile(br'(\+|-)?[0-9]+')
_re_psreal = re.compile(br'((\+|-)?([0-9]+\.[0-9]*)|(\.[0-9]+))')
_re_psstring = re.compile(br'\((\\.|.)*?\)')
_re_pshexstring = re.compile(br'<[0-9A-Fa-f]*>')
_re_psname = re.compile(br'/([^\0\t\n\f\r \(\)<>[\]{}/%]*)')
_re_psnull = re.compile(br'null')
_re_pskeyword = re.compile(br'[A-Za-z]+')
_re_psstreameol = re.compile(br'\r\n|\n')
_re_psendstream = re.compile(br'endstream')
_re_pseol = re.compile(br'(\r\n|\n|\r)')
_re_pswhitespaceandcomments = re.compile(br'([\0\t\n\f\r ]|%[^\n\r]*[\n\r]+)*')
# Object Notes:
# The string re is not really right - the spec allows
# balanced parentheses to appear in a string unescaped.
# Also, strings need to convert the following escapes:
# \n \r \t \b \f \( \) \\ \ddd (octal).
# Octal chars ddd can have 1, 2, or 3 octal digits with high order
# overflow ignored and leading zeros as needed (but not required,
# needed only if a normal digit follows the octal one).
#
# Hex strings may have odd number of digits - the last is assumed to be 0
#
# The token / (with no regular characters after it) is a valid name
# Names may include arbitrary characters by writing its 2-digit hex
# code preceded by a '#'.
#
# Array objects: sequence of objects enclosed in [ and ]
#
# Dictionary objects: sequence of key-value pairs enclosed in << and >>
# The keys must be names. A dictionary entry whose value is null is
# equivalent to an absent entry.
# By convention, dictonary objects have a value with key /Type that
# identifies the type of dictionary. Sometimes there is a subtype,
# with key /Subtype or /S. Values of type and subtype are always names.
#
# Stream objects: a dictionary followed by zero or more lines of bytes
# bracketed by keywords "stream" and "endstream". All streams must be
# indirect objects, and the dictionary must be a direct object.
#
# An Indirect Object Definition: two ints (object #, generation #)
# followed by the object bracketed by "obj" and "endobj"
#
# An Indirect Object Reference: two ints (object #, generation #) followed
# by "R".
# An indirect object reference to an object not defined in the file
# is taken as a reference to the null object.
def GetPDFObject(s, i):
"""Get one complete object starting at s[i].
Args:
s: bytes holding contents of a PDF file
i: index into s
Returns:
((objectid, value), int) - where int is i after the object
use None instead of (objectid, value) if
there are no more objects in s
"""
m = _re_pswhitespaceandcomments.match(s, i)
if m:
i = m.end()
if i == len(s):
return (None, i)
m = _re_psname.match(s, i)
if m:
return ((ONAME, s[m.start() + 1:m.end()].decode()), m.end())
m = _re_psreal.match(s, i)
if m:
return ((ONUM, float(m.group())), m.end())
m = _re_psint.match(s, i)
if m:
# could also be start of indirect object def or ref
(o, j) = GetPDFIndirectObjectRefOrDef(s, i)
if o is not None:
return (o, j)
else:
return ((ONUM, int(m.group())), m.end())
m = _re_psbool.match(s, i)
if m:
if m.group == b'true':
v = True
else:
v = False
return ((OBOOL, v), m.end())
m = _re_psnull.match(s, i)
if m:
return ((ONULL, None), m.end())
m = _re_psstring.match(s, i)
if m:
return GetPDFLiteralString(s, i)
m = _re_pshexstring.match(s, i)
if m:
return GetPDFHexString(s, i, m.end())
c = ordat(s, i)
if c == ord('['):
return GetPDFArray(s, i)
elif c == ord('<') and i < len(s) - 1 and ordat(s, i + 1) == ord('<'):
(o, j) = GetPDFDict(s, i)
# check if followed by stream
(w, k) = GetPDFKeyword(s, j)
if w == b'stream':
m = _re_psstreameol.match(s, k)
if m:
streamstart = m.end()
streamend = s.find(b'endstream', streamstart)
if streamend > 0:
# just return byte offsets in s where stream
# contents start and (most probably) end
return ((OSTREAM, (o[1], streamstart, streamend)),
streamend + 9)
return (o, j)
return (None, i + 1)
def GetPDFIndirectObjectRefOrDef(s, i):
"""Get an indirect object def or ref starting at s[i].
Args:
s: bytes holding contents of a PDF file
i: index into s
Returns:
((OINDIRECTDEF, (obj#,gen#,obj)), newi)
or ((OINDIRECTREF, (obj#, gen#)), newi)
or (None, i)
"""
(v, j) = GetPDFTwoInts(s, i)
if v is None:
return (None, i)
(obj_number, gen_number) = v
(w, j) = GetPDFKeyword(s, j)
if w == b'R':
return ((OINDIRECTREF, (obj_number, gen_number)), j)
elif w == b'obj':
(obj, j) = GetPDFObject(s, j)
if obj is not None:
(w, j) = GetPDFKeyword(s, j)
if w == b'endobj':
return ((OINDIRECTDEF, (obj_number, gen_number, obj)), j)
return (None, i)
def GetPDFTwoInts(s, i):
"""If there are two ints starting at s[i], return them, else return None.
Args:
s: PDF file contents
i: index into s
Returns:
((int, int), newi) or (None, i)
"""
m = _re_pswhitespaceandcomments.match(s, i)
j = i
if m:
j = m.end()
if j == len(s):
return (None, i)
m = _re_psint.match(s, j)
if m:
a = int(m.group())
j = m.end()
m = _re_pswhitespaceandcomments.match(s, j)
if m:
j = m.end()
if j == len(s):
return (None, i)
m = _re_psint.match(s, j)
if m:
b = int(m.group())
return ((a, b), m.end())
return (None, i)
def GetPDFKeyword(s, i):
"""Get a keyword (alphabetic chars, as byte string) starting at s[i].
If there is not a keyword there, just return (b'',i).
Args:
s: bytes holding contents of a PDF file
i: index into s
Returns:
(keyword, newi) - keyword will be b'' if couldn't find one
"""
m = _re_pswhitespaceandcomments.match(s, i)
j = i
if m:
j = m.end()
if j == len(s):
return (b'', i)
m = _re_pskeyword.match(s, j)
if m:
return (m.group(), m.end())
return (b'', i)
def GetPDFLiteralString(s, i):
"""Convert and return object for pdf literal string starting at s[i]."""
j = i + 1
balen = 0
v = []
while j < len(s):
c = ordat(s, j)
if c == ord(')'):
if balen == 0:
return ((OSTRING, ''.join(v)), j + 1)
else:
balen -= 1
elif c == ord('('):
balen += 1
elif c == ord('\\'):
j += 1
if j == len(s):
if WARN:
print("unterminated string at", i)
return ((OSTRING, ''.join(v)), j)
c = ordat(s, j)
if c == ord('n'):
v += '\n'
elif c == ord('f'):
v += '\f'
elif c == ord('r'):
v += '\r'
elif c == ord('t'):
v += '\t'
elif ord('0') <= c <= ord('7'):
x = c - ord('0')
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
j += 1
if j < len(s):
c = ordat(s, j)
if ord('0') <= c <= ord('7'):
x = x * 8 + c - ord('0')
j += 1
if j < len(s):
c = ordat(s, j)
if ord('0') <= c <= ord('7'):
x = x * 8 + c - ord('0')
if x >= 256:
x %= 256
v += chr(x)
else:
m = _re_pseol.match(s, j)
if m:
# backslash used for line continuation
j = m.end() - 1 # -1 to compensate for +1 that will happen
else:
v += chr(c)
else:
m = _re_pseol.match(s, j)
if m:
v += '\n'
j = m.end() - 1 # -1 to compensate for +1
else:
v += chr(c)
j += 1
if WARN:
print('unterminated string at', i)
return ((OSTRING, ''.join(v)), j)
def GetPDFHexString(s, i, iend):
"""Convert and return pdf hex string starting at s[i],
ending at s[iend-1]."""
j = i + 1
v = []
c = ''
jend = iend - 1
while j < jend:
p = _re_pswhitespaceandcomments.match(s, j)
if p:
j = p.end()
d = chr(ordat(s, j))
if c != '':
v.append(FromHexPair(c, d))
c = ''
else:
c = d
j += 1
if c != '':
v.append(FromHexPair(c, '0'))
return ((OSTRING, ''.join(v)), iend)
def FromHexPair(a, b):
"""Interpret string a+b as hex pair, and return the pair's value."""
try:
v = int(a + b, 16)
except TypeError:
if WARN:
print('funny hex pair', a + b)
v = 0
return chr(v)
def GetPDFArray(s, i):
"""Convert and return object array starting at s[i]."""
j = i + 1
v = []
while j < len(s):
m = _re_pswhitespaceandcomments.match(s, j)
if m:
j = m.end()
if j == len(s):
break
if ordat(s, j) == ord(']'):
return ((OARRAY, v), j + 1)
(o, j) = GetPDFObject(s, j)
if o is None:
break
v.append(o)
if WARN:
print('unterminated array starting at', i)
return ((OARRAY, v), j)
def GetPDFDict(s, i):
"""Convert and return object dict starting at s[i]."""
j = i + 2
v = {}
while j < len(s):
m = _re_pswhitespaceandcomments.match(s, j)
if m:
j = m.end()
if j == len(s):
break
if ordat(s, j) == ord('>') and ordat(s, j + 1) == ord('>'):
return ((ODICT, v), j + 2)
(o, j) = GetPDFObject(s, j)
if o is None:
break
if o[0] != ONAME:
if WARN:
print('expected name at', i)
break
name = o[1]
(o, j) = GetPDFObject(s, j)
if o is None:
break
v[name] = o
if WARN:
print('unterminated dict starting at', i)
return ((ODICT, v), j)
# Crossref dict:
# Cross references are a way of turning an (object #, generation #) into
# an actual object in the file, when and indirect reference of the form
# object# generation# R
# is found in another object.
# Cross references are of two types:
# 1) uncompressed: you find the object at a specified byte offset in the file
# 2) compressed: you find the object in an object stream which is in turn found
# by looking for a specified object# with implicit generation 0.
# We will build a map from (object#, generation#) to a tuple
# (kind, field2, field3)
# where if kind==XUNCOMPRESSED then field2 is the file byte offset of the object and field2
# is its generation #
# and if kind==XCOMPRESSED then field2 is the object # of the object stream containing it,
# and field3 is the index of that object within the stream
XUNCOMPRESSED = 1
XCOMPRESSED = 2
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
def GetPDFTrailerAndCrossrefs(s):
"""Find and return the (last) PDF trailer dictionary and cross reference
dict.
Args:
s: PDF file (as bytes)
Returns:
(trailer dict, crossref dict)
"""
startxrefi = s.rfind(b'startxref')
if startxrefi == -1:
if WARN:
print('cannot find startxref')
return (None, None)
crossrefi = -1
m = _re_pseol.match(s, startxrefi + 9)
if m:
m2 = _re_psint.match(s, m.end())
if m2:
crossrefi = int(m2.group())
if crossrefi <= 0:
if WARN:
print('cannot find crossref index')
return (None, None)
crossrefs = {}
d = None
last_trailerdict = None
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
if s[crossrefi:crossrefi+4] != b'xref':
# Could be Crossref stream
(obj, j) = GetPDFObject(s, crossrefi)
if PDFObjHasType(obj, OINDIRECTDEF):
strobj = obj[1][2]
if PDFObjHasType(strobj, OSTREAM):
strxrefs = GetPDFStreamContents(strobj, s, {}, False)
if strxrefs is None:
if WARN:
print('cannot decode crossref stream')
return (None, {})
d = strobj[1][0]
w = GetTypedValFromDictEntry(d, 'W', OARRAY, s, {})
ty = GetTypedValFromDictEntry(d, 'Type', ONAME, s, {})
sz = GetTypedValFromDictEntry(d, 'Size', ONUM, s, {})
index = GetTypedValFromDictEntry(d, 'Index', OARRAY, s, {})
prev = GetTypedValFromDictEntry(d, 'Prev', ONUM, s, {})
if ty != 'XRef' or sz is None or w is None:
if WARN:
print('something wrong with XRef stream dictionary')
return (None, {})
n1 = w[0][1]
n2 = w[1][1]
n3 = w[2][1]
ntot = n1 + n2 + n3
firstobjnum = 0
numobjs = sz
if index is not None:
firstobjnum = index[0][1]
numobjs = index[1][1]
k = 0
objnum = firstobjnum
while k + ntot <= len(strxrefs):
if n1 == 0:
f1 = 1
else:
(f1, k) = GetPDFMultiByteInt(strxrefs, k, n1)
(f2, k) = GetPDFMultiByteInt(strxrefs, k, n2)
if n3 == 0:
f3 = 0
else:
(f3, k) = GetPDFMultiByteInt(strxrefs, k, n3)
if f1 == 1:
crossrefs[(objnum, f3)] = (XUNCOMPRESSED, f2, f3)
elif f1 == 2:
crossrefs[(objnum, 0)] = (XCOMPRESSED, f2, f3)
elif f1 != 0:
if WARN:
print('unexpected type in XRef:', f1)
return (None, {})
objnum += 1
else:
if WARN:
print("no xref and object there is not stream")
print (obj)
else:
if WARN:
print("no xref and not indirect def")
print(obj)
return (d, crossrefs)
while crossrefi > 0:
i = crossrefi
if s[i:i + 4] != b'xref':
if WARN:
print('cannot find xref')
break
m = _re_pseol.match(s, i + 4)
if m:
i = m.end()
while i < startxrefi:
# Get start of subsection
(v, i) = GetPDFTwoInts(s, i)
if v is None:
break
(idstart, nentries) = v
m = _re_pswhitespaceandcomments.match(s, i)
if m:
i = m.end()
for k in range(idstart, idstart + nentries):
byteoffset = int(s[i:i + 10])
gen = int(s[i + 11:i + 16])
inuse = (ordat(s, i + 17) == ord('n'))
if inuse:
crossrefs[(k, gen)] = (XUNCOMPRESSED, byteoffset, gen)
i += 20
# Should be at 'trailer' now
(w, i) = GetPDFKeyword(s, i)
if w != b'trailer':
if WARN:
print('cannot find trailer')
break
(trailero, i) = GetPDFObject(s, i)
if trailero is None or trailero[0] != ODICT:
if WARN:
print('cannot find trailer dict')
break
trailerdict = trailero[1]
if last_trailerdict is None:
last_trailerdict = trailerdict
if 'Prev' in trailerdict:
crossrefi = GetTypedValFromDictEntry(trailerdict, 'Prev', ONUM, s, crossrefs)
if crossrefi is None:
crossrefi = -1
else:
crossrefi = -1
return (last_trailerdict, crossrefs)
def GetPDFMultiByteInt(s, i, fieldlen):
"""Get a multibyte int from a string of bytes
Args:
s: string of bytes
i: int, offset in s to start getting the result
fieldlen: int, how many bytes to get
Returns:
int: accumulated multibyte value (high order byte first in s)
"""
ans = 0
for k in range(i, i + fieldlen):
ans = ans * 256 + ord(s[k])
return (ans, i + fieldlen)
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
def ReadPDFPageOneContents(filename):
"""Read a PDF file and return Content string for its first page.
Args:
filename: name of file
Returns:
string: Content string for first page
"""
try:
f = open(filename, "rb") # binary since some parts may be compressed
except IOError:
if WARN:
print("Can't open file", filename)
return ''
contents = f.read()
f.close()
return GetPDFPageOneContents(contents)
def GetPDFPageOneContents(s):
"""Find and return first page in PDF file, given as string.
First get the last trailer's dictionary, which should contain
the Root object, and also the crossref dictionary which gives
byte offsets for all indirect objects.
Then from Root object, find Pages object (a page tree), and
follow leftmost Kid until get to a leaf Page object, which
in turn has the desired Contents object, which is a stream
or an array of streams. Decompress (if necessary) the
stream(s) and return their concatenation.
Args:
s: bytes holding contents of a PDF file
Returns:
string: the decoded (possibly decompressed) contents of the first page
"""
(trailerdict, crossrefs) = GetPDFTrailerAndCrossrefs(s)
if not trailerdict or not crossrefs:
if WARN:
print('problem finding trailer or crossrefs')
return ''
if 'Root' not in trailerdict:
if WARN:
print('cannot find Root object')
return ''
root = GetTypedValFromDictEntry(trailerdict, 'Root', ODICT, s, crossrefs)
if root is None:
if WARN:
print('cannot find root dictionary')
return ''
pagesdict = GetTypedValFromDictEntry(root, 'Pages', ODICT, s, crossrefs)
if pagesdict is None:
if WARN:
print('cannot find Pages dictionary')
return ''
pnode = pagesdict
while pnode:
pnodetype = PDFDictType(pnode)
if pnodetype == 'Pages':
kidsarray = GetTypedValFromDictEntry(pnode, 'Kids', OARRAY, s,
crossrefs)
if not kidsarray:
if WARN:
print('cannot find Kids in Pages')
return ''
if len(kidsarray) == 0:
if WARN:
print('Kids array has no Page')
return ''
pnodeobj = GetPDFObjFromIndirectRef(kidsarray[0], s, crossrefs)
if PDFObjHasType(pnodeobj, ODICT):
pnode = pnodeobj[1]
else:
if WARN:
print('Kids element has unexpected type')
return ''
elif pnodetype == 'Page':
contentsobj = GetPDFObjFromDictEntry(pnode, 'Contents', s,
crossrefs)
if contentsobj is None:
# it is legal for there to be no contents object:
# means empty page
if WARN:
print('First Page is empty')
return ''
if contentsobj[0] == OSTREAM:
return GetPDFStreamContents(contentsobj, s, crossrefs)
elif contentsobj[0] == OARRAY:
pieces = []
for c in contentsobj[1]:
if not PDFObjHasType(c, OINDIRECTREF):
if WARN:
print('Contents obj child not an indirect ref')
return ''
o = GetPDFObjFromIndirectRef(c, s, crossrefs)
if not PDFObjHasType(o, OSTREAM):
if WARN:
print('Contents obj child not a stream')
return ''
pieces.append(GetPDFStreamContents(o, s, crossrefs))
return '\n'.join(pieces)
else:
if WARN:
print('Contents object has unexpected type',
contentsobj[0])
return ''
else:
if WARN:
print('Page tree node has unexpected type', pnodetype)
return ''
# shouldn't get here
return ''
def GetPDFObjFromIndirectRef(obj, s, crossrefs):
"""Return the Object that is referred to by an indirect reference.
Args:
obj: (int, value) - should be (OINDIRECTREF, (obj_number, gen_number))
s: string - contents of PDF file
crossrefs: dict - maps (obj_number, gen_number) to crossref triple
Returns:
(objectid, value) - the referred value (inside containing OINDIRECTDEF)
or None if there is any problem
"""
if not PDFObjHasType(obj, OINDIRECTREF):
return None
key = obj[1]
if key not in crossrefs:
return None
(f1, f2, f3) = crossrefs[key]
if f1 == XUNCOMPRESSED:
if f2 < 0 or f2 >= len(s):
return None
(o, _) = GetPDFObject(s, f2)
elif f1 == XCOMPRESSED:
o = GetPDFCompressedObject(s, f2, f3, crossrefs)
return o
else:
if WARN:
print("Bad xref type")
return None
if PDFObjHasType(o, OINDIRECTDEF):
return o[1][2]
else:
return None
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
def GetPDFCompressedObject(s, strnum, oindex, crossrefs):
"""Get one complete object from compressed stream.
Args:
s : bytes holding contents of a PDF file
strnum: object number of object stream where object is
oindex: index of object within the stream
crossrefs: dict - maps (obj_number, gen_number) to crossref triple
Returns:
(objectid, value) - or None, if no such object
"""
strkey = (strnum, 0)
if strkey not in crossrefs:
if WARN:
print("could not find object", strnum, "in crossrefs")
return None
(g1, g2, g3) = crossrefs[strkey]
if g1 != XUNCOMPRESSED:
if WARN:
print("stream object is not uncompressed", g1, g2, g3)
return None
if g2 < 0 or g2 >= len(s):
return None
(ostream, _) = GetPDFObject(s, g2)
if PDFObjHasType(ostream, OINDIRECTDEF):
ostream = ostream[1][2]
if not PDFObjHasType(ostream, OSTREAM):
if WARN:
print("stream object does not have type stream")
return None
streamcont = GetPDFStreamContents(ostream, s, crossrefs, False)
d = ostream[1][0]
ty = GetTypedValFromDictEntry(d, "Type", ONAME, s, crossrefs)
if ty != "ObjStm":
if WARN:
print("stream object does not have Type ObjStm")
return None
n = GetTypedValFromDictEntry(d, "N", ONUM, s, crossrefs)
first = GetTypedValFromDictEntry(d, "First", ONUM, s, crossrefs)
if not n or not first:
if WARN:
print("required n or first not in object stream")
return None
i = 0
ans = None
for count in range(n):
(intpair, i) = GetPDFTwoInts(streamcont, i)
if not intpair:
if WARN:
print("stream object did not find int pair at count", count)
return None
(id, off) = intpair
obj = GetPDFObject(streamcont, first + off)
if count == oindex:
if obj:
ans = obj[0]
break
return ans
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
def GetPDFObjFromDictEntry(d, entryname, s, crossrefs):
"""Return the PDF object that should be at given entry in d.
Follow any Indirect refs until get a real object.
"""
if entryname not in d:
return None
o = d[entryname]
if PDFObjHasType(o, OINDIRECTREF):
return GetPDFObjFromIndirectRef(o, s, crossrefs)
else:
return o
def GetTypedValFromDictEntry(d, entryname, ty, s, crossrefs):
"""Return the value that should be found by entry in d and have type ty."""
o = GetPDFObjFromDictEntry(d, entryname, s, crossrefs)
if PDFObjHasType(o, ty):
return o[1]
else:
return None
def PDFObjHasType(o, ty):
"""Return True if o, a PDF Object, has type ty."""
if o is None:
return False
return o[0] == ty
def PDFDictType(d):
"""Return string value of 'Type' entry in d, '' if not there."""
if 'Type' in d:
tyobj = d['Type']
if PDFObjHasType(tyobj, ONAME):
return tyobj[1]
return ''
def GetPDFStreamContents(contentsobj, s, crossrefs, dodecode=True):
"""Return the contents of a stream object, applying any needed filters.
For now, only handle FlateDecode filter, and with no DecodeParms.
Args:
contentsobj: (OSTREAM, (dict, istart, iend))
s: bytes - PDF file contents
crossrefs: dict - maps (obj_number, gen_number) to byte offset in s
dodecode: bool - should we decode too?
Returns:
string - the contents (if dodecode, decoded using latin1 decoder)
"""
if not PDFObjHasType(contentsobj, OSTREAM):
return None
(d, istart, _) = contentsobj[1]
length = GetTypedValFromDictEntry(d, 'Length', ONUM, s, crossrefs)
if length is None:
return ''
ans = s[istart:istart + length]
filterobj = GetPDFObjFromDictEntry(d, 'Filter', s, crossrefs)
if filterobj is None:
return ans.decode()
filters = []
if PDFObjHasType(filterobj, ONAME):
filters = [filterobj[1]]
elif PDFObjHasType(filterobj, OARRAY):
for o in filterobj[1]:
if PDFObjHasType(o, ONAME):
filters.append(o[1])
for fname in filters:
if fname == 'FlateDecode':
if not zlib:
raise RuntimeError("pdf decoding requires missing zlib module")
parms = GetTypedValFromDictEntry(d, 'DecodeParms', ODICT, s, crossrefs)
needPngPredictor = False
columns = 1
if parms is not None:
predictor = GetTypedValFromDictEntry(parms, 'Predictor', ONUM, s, crossrefs)
columns = GetTypedValFromDictEntry(parms, 'Columns', ONUM, s, crossrefs)
if predictor is not None:
if predictor == 1:
pass
elif predictor < 10:
if WARN:
print('unhandled predictor type', predictor)
else:
if columns is None:
columns = 1
needPngPredictor = True
ans = zlib.decompress(ans)
if needPngPredictor:
ansbytes = []
col1 = columns + 1
k = 0
currow = [0] * columns
while k + col1 <= len(ans):
if ans[k] != 2:
if WARN:
print('unhandled PNG predictor type: ', ans[k])
k += 1
for j in range(0, columns):
currow[j] = (currow[j] + ans[k + j]) & 0xFF
ansbytes.append(chr(currow[j]))
k += columns
if k != len(ans):
if WARN:
print("FlateDecode with prediction didn't consume all bytes")
ans = ''.join(ansbytes)
if dodecode:
ans = ans.decode(encoding='latin1', errors='ignore')
else:
if WARN:
print('unhandled stream filter', fname)
return ''
return ans
if __name__ == "__main__":
if len(sys.argv) == 2:
page1contents = ReadPDFPageOneContents(sys.argv[1])
sys.stdout.write(page1contents)