Skip to content
Snippets Groups Projects
pdf.py 29.2 KiB
Newer Older
  • Learn to ignore specific revisions
  • # ##### BEGIN GPL LICENSE BLOCK #####
    #
    #  This program is free software; you can redistribute it and/or
    #  modify it under the terms of the GNU General Public License
    #  as published by the Free Software Foundation; either version 2
    #  of the License, or (at your option) any later version.
    #
    #  This program is distributed in the hope that it will be useful,
    #  but WITHOUT ANY WARRANTY; without even the implied warranty of
    #  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    #  GNU General Public License for more details.
    #
    #  You should have received a copy of the GNU General Public License
    #  along with this program; if not, write to the Free Software Foundation,
    #  Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
    #
    # ##### END GPL LICENSE BLOCK #####
    
    # <pep8 compliant>
    
    """Functions for dealing with PDF files.
    """
    
    __author__ = "howard.trickey@gmail.com"
    
    import re
    import sys
    try:
        import zlib
    except ImportError:
        zlib = None
    
    # Indexing a 'bytes' object gives an int on Python 3 but a one-character
    # string on Python 2.  ordat(b, i) hides that difference and always
    # returns the integer value of the byte at position i.
    if sys.version_info[0] != 3:
        def ordat(b, i):
            return ord(b[i])
    else:
        def ordat(b, i):
            return b[i]
    
    WARN = True  # print Warnings about strange things?

    # PDF objects: type tags used as the first element of every
    # (objectid, value) pair produced by the parser.
    OBOOL = 0
    ONUM = 1
    OSTRING = 2
    ONAME = 3
    OARRAY = 4
    ODICT = 5
    OSTREAM = 6
    ONULL = 7
    OINDIRECTDEF = 8
    OINDIRECTREF = 9

    # Token patterns.  All of them work on bytes and are applied with
    # .match() at an explicit offset.
    _re_psbool = re.compile(br'true|false')
    _re_psint = re.compile(br'(\+|-)?[0-9]+')
    # The optional sign applies to both spellings of a real ("12.5"/"12."
    # and ".5"), so that "-.5" and "+.5" are recognised as numbers too.
    _re_psreal = re.compile(br'(\+|-)?(([0-9]+\.[0-9]*)|(\.[0-9]+))')
    _re_psstring = re.compile(br'\((\\.|.)*?\)')
    _re_pshexstring = re.compile(br'<[0-9A-Fa-f]*>')
    _re_psname = re.compile(br'/([^\0\t\n\f\r \(\)<>[\]{}/%]*)')
    _re_psnull = re.compile(br'null')
    _re_pskeyword = re.compile(br'[A-Za-z]+')
    # End of line that must follow the "stream" keyword (a lone CR is
    # deliberately not accepted here).
    _re_psstreameol = re.compile(br'\r\n|\n')
    _re_psendstream = re.compile(br'endstream')
    _re_pseol = re.compile(br'(\r\n|\n|\r)')
    # Any run (possibly empty) of whitespace and %-comments; because of the
    # trailing '*' this pattern always matches.
    _re_pswhitespaceandcomments = re.compile(br'([\0\t\n\f\r ]|%[^\n\r]*[\n\r]+)*')
    _re_pswhitespaceandcomments = re.compile(br'([\0\t\n\f\r ]|%[^\n\r]*[\n\r]+)*')
    
    # Object Notes:
    # The string re is not really right - the spec allows
    # balanced parentheses to appear in a string unescaped.
    # Also, strings need to convert the following escapes:
    #   \n \r \t \b \f \( \) \\ \ddd (octal).
    # Octal chars ddd can have 1, 2, or 3 octal digits with high order
    # overflow ignored and leading zeros as needed (but not required,
    # needed only if a normal digit follows the octal one).
    #
    # Hex strings may have odd number of digits - the last is assumed to be 0
    #
    # The token / (with no regular characters after it) is a valid name.
    # Names may include arbitrary characters by writing each one's 2-digit
    # hex code preceded by a '#'.
    #
    # Array objects: sequence of objects enclosed in [ and ]
    #
    # Dictionary objects: sequence of key-value pairs enclosed in << and >>
    # The keys must be names.  A dictionary entry whose value is null is
    # equivalent to an absent entry.
    # By convention, dictionary objects have a value with key /Type that
    # identifies the type of dictionary. Sometimes there is a subtype,
    # with key /Subtype or /S.  Values of type and subtype are always names.
    #
    # Stream objects: a dictionary followed by zero or more lines of bytes
    # bracketed by keywords "stream" and "endstream". All streams must be
    # indirect objects, and the dictionary must be a direct object.
    #
    # An Indirect Object Definition: two ints (object #, generation #)
    # followed by the object bracketed by "obj" and "endobj"
    #
    # An Indirect Object Reference: two ints (object #, generation #) followed
    # by "R".
    # An indirect object reference to an object not defined in the file
    # is taken as a reference to the null object.
    
    
    def GetPDFObject(s, i):
        """Get one complete object starting at s[i].

        Leading whitespace and comments are skipped first.  A dictionary that
        is followed by the keyword "stream" is returned as an OSTREAM object
        whose value is (dict, start offset, end offset) into s.

        Args:
          s: bytes holding contents of a PDF file
          i: index into s
        Returns:
          ((objectid, value), int) - where int is i after the object
                                     use None instead of (objectid, value) if
                                     there are no more objects in s; None is
                                     also used when nothing recognisable starts
                                     at s[i], and int is then one past that byte
        """

        m = _re_pswhitespaceandcomments.match(s, i)
        if m:
            i = m.end()
        if i >= len(s):
            return (None, i)
        m = _re_psname.match(s, i)
        if m:
            return ((ONAME, s[m.start() + 1:m.end()].decode()), m.end())
        # reals must be tried before ints: "12.5" also starts with an int
        m = _re_psreal.match(s, i)
        if m:
            return ((ONUM, float(m.group())), m.end())
        m = _re_psint.match(s, i)
        if m:
            # could also be start of indirect object def or ref
            (o, j) = GetPDFIndirectObjectRefOrDef(s, i)
            if o is not None:
                return (o, j)
            else:
                return ((ONUM, int(m.group())), m.end())
        m = _re_psbool.match(s, i)
        if m:
            # m.group must be *called*; comparing the bound method itself to
            # b'true' is always False and turned every boolean into False.
            return ((OBOOL, m.group() == b'true'), m.end())
        m = _re_psnull.match(s, i)
        if m:
            return ((ONULL, None), m.end())
        c = ordat(s, i)
        if c == ord('('):
            # Dispatch on the opening parenthesis alone: the single-line
            # string regex would reject literal strings that span several
            # lines, which GetPDFLiteralString knows how to read.
            return GetPDFLiteralString(s, i)
        m = _re_pshexstring.match(s, i)
        if m:
            return GetPDFHexString(s, i, m.end())
        if c == ord('['):
            return GetPDFArray(s, i)
        elif c == ord('<') and i < len(s) - 1 and ordat(s, i + 1) == ord('<'):
            (o, j) = GetPDFDict(s, i)
            # check if followed by stream
            (w, k) = GetPDFKeyword(s, j)
            if w == b'stream':
                m = _re_psstreameol.match(s, k)
                if m:
                    streamstart = m.end()
                    streamend = s.find(b'endstream', streamstart)
                    if streamend != -1:
                        # just return byte offsets in s where stream
                        # contents start and (most probably) end;
                        # 9 == len(b'endstream')
                        return ((OSTREAM, (o[1], streamstart, streamend)),
                            streamend + 9)
            return (o, j)
        return (None, i + 1)
    
    
    def GetPDFIndirectObjectRefOrDef(s, i):
        """Parse "n g R" or "n g obj ... endobj" starting at s[i].

        Args:
          s: bytes holding contents of a PDF file
          i: index into s
        Returns:
          ((OINDIRECTREF, (obj#, gen#)), newi) for a reference,
          ((OINDIRECTDEF, (obj#, gen#, obj)), newi) for a definition,
          (None, i) when neither form is found at s[i]
        """

        (numbers, pos) = GetPDFTwoInts(s, i)
        if numbers is None:
            return (None, i)
        (objnum, gennum) = numbers
        (word, pos) = GetPDFKeyword(s, pos)
        if word == b'R':
            return ((OINDIRECTREF, (objnum, gennum)), pos)
        if word != b'obj':
            return (None, i)
        # a definition: one object, then the closing keyword
        (body, pos) = GetPDFObject(s, pos)
        if body is None:
            return (None, i)
        (word, pos) = GetPDFKeyword(s, pos)
        if word != b'endobj':
            return (None, i)
        return ((OINDIRECTDEF, (objnum, gennum, body)), pos)
    
    
    def GetPDFTwoInts(s, i):
        """Read two whitespace-separated ints starting at s[i], if present.

        Whitespace and comments before each int are skipped.

        Args:
          s: PDF file contents
          i: index into s
        Returns:
          ((int, int), newi) or (None, i)
        """

        notfound = (None, i)
        gap = _re_pswhitespaceandcomments.match(s, i)
        pos = gap.end() if gap else i
        if pos == len(s):
            return notfound
        first = _re_psint.match(s, pos)
        if not first:
            return notfound
        pos = first.end()
        gap = _re_pswhitespaceandcomments.match(s, pos)
        if gap:
            pos = gap.end()
        if pos == len(s):
            return notfound
        second = _re_psint.match(s, pos)
        if not second:
            return notfound
        return ((int(first.group()), int(second.group())), second.end())
    
    
    def GetPDFKeyword(s, i):
        """Read a keyword (a run of ASCII letters, as bytes) starting at s[i].

        Whitespace and comments in front of the keyword are skipped.  When no
        keyword is found the index is left where it was.

        Args:
          s: bytes holding contents of a PDF file
          i: index into s
        Returns:
          (keyword, newi) - (b'', i) if there is no keyword
        """

        gap = _re_pswhitespaceandcomments.match(s, i)
        start = gap.end() if gap else i
        if start != len(s):
            word = _re_pskeyword.match(s, start)
            if word:
                return (word.group(), word.end())
        return (b'', i)
    
    
    def GetPDFLiteralString(s, i):
        """Convert and return object for pdf literal string starting at s[i].

        Handles the backslash escapes for n, r, t, b and f, octal escapes of
        one to three digits (overflow above 255 is discarded), a backslash
        followed by any other character (which stands for that character,
        e.g. an escaped parenthesis or backslash), and a backslash at the end
        of a line (line continuation).  An unescaped end of line of any kind
        is stored as a single newline.  Balanced unescaped parentheses are
        part of the string and are kept in the value.

        Args:
          s: bytes holding contents of a PDF file
          i: index of the opening '(' in s
        Returns:
          ((OSTRING, str), newi) - newi is just past the closing ')', or
          len(s) if the string is not terminated (a warning is printed then,
          if WARN is set)
        """

        simple_escapes = {
            ord('n'): '\n',
            ord('r'): '\r',
            ord('t'): '\t',
            ord('b'): '\b',
            ord('f'): '\f',
        }
        n = len(s)
        j = i + 1
        depth = 0  # nesting level of unescaped parentheses inside the string
        v = []
        while j < n:
            c = ordat(s, j)
            if c == ord(')'):
                if depth == 0:
                    return ((OSTRING, ''.join(v)), j + 1)
                depth -= 1
                v.append(')')
                j += 1
            elif c == ord('('):
                depth += 1
                v.append('(')
                j += 1
            elif c == ord('\\'):
                j += 1
                if j == n:
                    # lone backslash at end of data: report as unterminated
                    break
                c = ordat(s, j)
                if c in simple_escapes:
                    v.append(simple_escapes[c])
                    j += 1
                elif ord('0') <= c <= ord('7'):
                    # Up to three octal digits.  j is advanced only past the
                    # digits actually consumed, so the character following a
                    # short escape is not swallowed.
                    x = 0
                    ndigits = 0
                    while ndigits < 3 and j < n:
                        c = ordat(s, j)
                        if not ord('0') <= c <= ord('7'):
                            break
                        x = x * 8 + c - ord('0')
                        ndigits += 1
                        j += 1
                    v.append(chr(x % 256))
                else:
                    m = _re_pseol.match(s, j)
                    if m:
                        # backslash used for line continuation
                        j = m.end()
                    else:
                        v.append(chr(c))
                        j += 1
            else:
                m = _re_pseol.match(s, j)
                if m:
                    v.append('\n')
                    j = m.end()
                else:
                    v.append(chr(c))
                    j += 1
        if WARN:
            print('unterminated string at', i)
        return ((OSTRING, ''.join(v)), j)
    
    
    def GetPDFHexString(s, i, iend):
        """Convert and return pdf hex string starting at s[i],
        ending at s[iend-1].

        Args:
          s: bytes holding contents of a PDF file
          i: index of the opening '<'
          iend: index just past the closing '>'
        Returns:
          ((OSTRING, str), iend) - one character per pair of hex digits; an
          odd trailing digit is paired with '0'
        """

        j = i + 1
        v = []
        pending = ''  # first digit of a pair, waiting for its partner
        jend = iend - 1
        while j < jend:
            p = _re_pswhitespaceandcomments.match(s, j)
            if p:
                j = p.end()
            if j >= jend:
                # only whitespace was left: do not treat the closing '>'
                # (or anything after it) as a hex digit
                break
            d = chr(ordat(s, j))
            if pending != '':
                v.append(FromHexPair(pending, d))
                pending = ''
            else:
                pending = d
            j += 1
        if pending != '':
            v.append(FromHexPair(pending, '0'))
        return ((OSTRING, ''.join(v)), iend)
    
    
    def FromHexPair(a, b):
        """Interpret string a+b as hex pair, and return the pair's value.

        Args:
          a: str - high-order hex digit
          b: str - low-order hex digit
        Returns:
          str: the single character with that code; chr(0) if a+b is not
          valid hexadecimal (a warning is printed then, if WARN is set)
        """

        try:
            v = int(a + b, 16)
        except (TypeError, ValueError):
            # int() reports bad digits with ValueError, not TypeError
            if WARN:
                print('funny hex pair', a + b)
            v = 0
        return chr(v)
    
    
    def GetPDFArray(s, i):
        """Parse the array whose opening '[' is at s[i].

        Returns ((OARRAY, list of objects), index just past the closing ']').
        If the data ends or an element cannot be parsed before the ']' is
        seen, a warning is printed (when WARN is set) and the elements read
        so far are returned with the index where parsing stopped.
        """

        total = len(s)
        closing = ord(']')
        elements = []
        pos = i + 1
        while pos < total:
            gap = _re_pswhitespaceandcomments.match(s, pos)
            if gap:
                pos = gap.end()
                if pos == total:
                    break
            if ordat(s, pos) == closing:
                return ((OARRAY, elements), pos + 1)
            (element, pos) = GetPDFObject(s, pos)
            if element is None:
                break
            elements.append(element)
        if WARN:
            print('unterminated array starting at', i)
        return ((OARRAY, elements), pos)
    
    
    def GetPDFDict(s, i):
        """Convert and return object dict starting at s[i].

        Args:
          s: bytes holding contents of a PDF file
          i: index of the opening '<<' in s
        Returns:
          ((ODICT, dict), newi) - the dict maps name strings to
          (objectid, value) pairs.  If the closing '>>' is not reached (end
          of data, a key that is not a name, or an unparsable object) a
          warning is printed, if WARN is set, and the entries read so far
          are returned.
        """

        j = i + 2
        v = {}
        while j < len(s):
            m = _re_pswhitespaceandcomments.match(s, j)
            if m:
                j = m.end()
                if j == len(s):
                    break
            # Compare a two-byte slice rather than indexing s[j + 1], which
            # would raise IndexError when a lone '>' is the last byte.
            if s[j:j + 2] == b'>>':
                return ((ODICT, v), j + 2)
            (o, j) = GetPDFObject(s, j)
            if o is None:
                break
            if o[0] != ONAME:
                if WARN:
                    print('expected name at', i)
                break
            name = o[1]
            (o, j) = GetPDFObject(s, j)
            if o is None:
                break
            v[name] = o
        if WARN:
            print('unterminated dict starting at', i)
        return ((ODICT, v), j)
    
    
    # Crossref dict:
    # Cross references are a way of turning an (object #, generation #) into
    # an actual object in the file, when an indirect reference of the form
    #    object# generation# R
    # is found in another object.
    # Cross references are of two types:
    # 1) uncompressed: you find the object at a specified byte offset in the file
    # 2) compressed: you find the object in an object stream which is in turn found
    #    by looking for a specified object# with implicit generation 0.
    # We will build a map from (object#, generation#) to a tuple
    #    (kind, field2, field3)
    # where if kind==XUNCOMPRESSED then field2 is the file byte offset of the object and field3
    #                              is its generation #
    # and   if kind==XCOMPRESSED then field2 is the object # of the object stream containing it,
    #                              and field3 is the index of that object within the stream
    # Kinds of cross reference entry; stored as the first element of each
    # crossref tuple.
    XUNCOMPRESSED = 1  # object is found at a byte offset in the file
    XCOMPRESSED = 2  # object is found inside an object stream
    
    
    def GetPDFTrailerAndCrossrefs(s):
        """Find and return the (last) PDF trailer dictionary and cross reference
        dict.

        The offset after the last "startxref" keyword is followed.  If it
        points at a classic "xref" table, that table and all tables chained
        through the trailers' /Prev entries are read; the entry from the most
        recent table wins when an object is listed more than once.  Otherwise
        the object at that offset is read as a cross reference stream, whose
        dictionary then plays the role of the trailer.

        Args:
          s: PDF file (as bytes)
        Returns:
          (trailer dict, crossref dict) - crossref dict maps
          (obj_number, gen_number) to (kind, field2, field3) tuples.
          (None, None) if the cross reference offset cannot be found;
          the trailer dict is None if no usable trailer could be read.
        """

        startxrefi = s.rfind(b'startxref')
        if startxrefi == -1:
            if WARN:
                print('cannot find startxref')
            return (None, None)
        crossrefi = -1
        # 9 == len(b'startxref'); the offset is on the following line
        m = _re_pseol.match(s, startxrefi + 9)
        if m:
            m2 = _re_psint.match(s, m.end())
            if m2:
                crossrefi = int(m2.group())
        if crossrefi <= 0:
            if WARN:
                print('cannot find crossref index')
            return (None, None)
        crossrefs = {}
        last_trailerdict = None

        if s[crossrefi:crossrefi + 4] != b'xref':
            # Could be Crossref stream
            (obj, j) = GetPDFObject(s, crossrefi)
            if not PDFObjHasType(obj, OINDIRECTDEF):
                if WARN:
                    print("no xref and not indirect def")
                    print(obj)
                return (None, crossrefs)
            strobj = obj[1][2]
            if not PDFObjHasType(strobj, OSTREAM):
                if WARN:
                    print("no xref and object there is not stream")
                    print(obj)
                return (None, crossrefs)
            strxrefs = GetPDFStreamContents(strobj, s, {}, False)
            if strxrefs is None:
                if WARN:
                    print('cannot decode crossref stream')
                return (None, {})
            d = strobj[1][0]
            w = GetTypedValFromDictEntry(d, 'W', OARRAY, s, {})
            ty = GetTypedValFromDictEntry(d, 'Type', ONAME, s, {})
            sz = GetTypedValFromDictEntry(d, 'Size', ONUM, s, {})
            index = GetTypedValFromDictEntry(d, 'Index', OARRAY, s, {})
            if ty != 'XRef' or sz is None or w is None or len(w) < 3:
                if WARN:
                    print('something wrong with XRef stream dictionary')
                return (None, {})
            # byte widths of the three fields of every entry
            n1 = w[0][1]
            n2 = w[1][1]
            n3 = w[2][1]
            ntot = n1 + n2 + n3
            if ntot <= 0:
                # zero-width entries would never advance through the data
                if WARN:
                    print('something wrong with XRef stream dictionary')
                return (None, {})
            # NOTE(review): only the start of the first /Index subsection is
            # used, and /Prev is not followed for cross reference streams.
            objnum = 0
            if index:
                objnum = index[0][1]
            k = 0
            while k + ntot <= len(strxrefs):
                if n1 == 0:
                    f1 = 1  # an absent type field means type 1
                else:
                    (f1, k) = GetPDFMultiByteInt(strxrefs, k, n1)
                (f2, k) = GetPDFMultiByteInt(strxrefs, k, n2)
                if n3 == 0:
                    f3 = 0
                else:
                    (f3, k) = GetPDFMultiByteInt(strxrefs, k, n3)
                if f1 == 1:
                    crossrefs[(objnum, f3)] = (XUNCOMPRESSED, f2, f3)
                elif f1 == 2:
                    crossrefs[(objnum, 0)] = (XCOMPRESSED, f2, f3)
                elif f1 != 0:
                    if WARN:
                        print('unexpected type in XRef:', f1)
                    return (None, {})
                objnum += 1
            return (d, crossrefs)

        visited = set()  # table offsets already read, to stop /Prev cycles
        while crossrefi > 0:
            if crossrefi in visited:
                if WARN:
                    print('xref Prev chain loops back to', crossrefi)
                break
            visited.add(crossrefi)
            i = crossrefi
            if s[i:i + 4] != b'xref':
                if WARN:
                    print('cannot find xref')
                break
            m = _re_pseol.match(s, i + 4)
            if m:
                i = m.end()
            while i < startxrefi:
                # Get start of subsection
                (v, i) = GetPDFTwoInts(s, i)
                if v is None:
                    break
                (idstart, nentries) = v
                m = _re_pswhitespaceandcomments.match(s, i)
                if m:
                    i = m.end()
                for k in range(idstart, idstart + nentries):
                    # fixed 20-byte entries: 10-digit offset, space,
                    # 5-digit generation, space, 'n' or 'f', 2-byte EOL
                    byteoffset = int(s[i:i + 10])
                    gen = int(s[i + 11:i + 16])
                    inuse = (ordat(s, i + 17) == ord('n'))
                    if inuse:
                        # Tables are visited newest first, so an entry that
                        # is already present must not be overwritten by an
                        # older table.
                        crossrefs.setdefault((k, gen),
                            (XUNCOMPRESSED, byteoffset, gen))
                    i += 20
            # Should be at 'trailer' now
            (w, i) = GetPDFKeyword(s, i)
            if w != b'trailer':
                if WARN:
                    print('cannot find trailer')
                break
            (trailero, i) = GetPDFObject(s, i)
            if trailero is None or trailero[0] != ODICT:
                if WARN:
                    print('cannot find trailer dict')
                break
            trailerdict = trailero[1]
            if last_trailerdict is None:
                last_trailerdict = trailerdict
            crossrefi = -1
            if 'Prev' in trailerdict:
                prev = GetTypedValFromDictEntry(trailerdict, 'Prev', ONUM, s,
                    crossrefs)
                if prev is not None:
                    crossrefi = int(prev)
        return (last_trailerdict, crossrefs)
    
    
    def GetPDFMultiByteInt(s, i, fieldlen):
        """Get a multibyte int from a string of bytes

        Args:
          s: bytes, or a str whose characters stand for byte values
          i: int, offset in s to start getting the result
          fieldlen: int, how many bytes to get
        Returns:
          (int, int): the accumulated multibyte value (high order byte first
          in s) and the offset just past the field, i + fieldlen
        Raises:
          IndexError: if the field extends past the end of s
        """

        ans = 0
        for k in range(i, i + fieldlen):
            byte = s[k]
            # Indexing bytes already gives an int on Python 3; ord() is only
            # needed for one-character strings.
            if not isinstance(byte, int):
                byte = ord(byte)
            ans = ans * 256 + byte
        return (ans, i + fieldlen)
    
    
    def ReadPDFPageOneContents(filename):
        """Read a PDF file and return Content string for its first page.

        Args:
          filename: name of file
        Returns:
          string: Content string for first page; '' if the file cannot be
          opened (a warning is printed then, if WARN is set)
        """

        try:
            f = open(filename, "rb")  # binary since some parts may be compressed
        except IOError:
            if WARN:
                print("Can't open file", filename)
            return ''
        # 'with' closes the file even if reading raises
        with f:
            contents = f.read()
        return GetPDFPageOneContents(contents)
    
    
    def GetPDFPageOneContents(s):
        """Find and return first page in PDF file, given as string.

        First get the last trailer's dictionary, which should contain
        the Root object, and also the crossref dictionary which gives
        byte offsets for all indirect objects.
        Then from Root object, find Pages object (a page tree), and
        follow leftmost Kid until get to a leaf Page object, which
        in turn has the desired Contents object, which is a stream
        or an array of streams. Decompress (if necessary) the
        stream(s) and return their concatenation.

        Every failure prints a warning (if WARN is set) and returns ''.

        Args:
          s: bytes holding contents of a PDF file
        Returns:
          string: the decoded (possibly decompressed) contents of the first page
        """

        (trailerdict, crossrefs) = GetPDFTrailerAndCrossrefs(s)
        if not trailerdict or not crossrefs:
            if WARN:
                print('problem finding trailer or crossrefs')
            return ''
        if 'Root' not in trailerdict:
            if WARN:
                print('cannot find Root object')
            return ''
        root = GetTypedValFromDictEntry(trailerdict, 'Root', ODICT, s, crossrefs)
        if root is None:
            if WARN:
                print('cannot find root dictionary')
            return ''
        pagesdict = GetTypedValFromDictEntry(root, 'Pages', ODICT, s, crossrefs)
        if pagesdict is None:
            if WARN:
                print('cannot find Pages dictionary')
            return ''
        pnode = pagesdict
        seen = set()  # (obj#, gen#) of Kids already followed, to stop cycles
        while pnode:
            pnodetype = PDFDictType(pnode)
            if pnodetype == 'Pages':
                kidsarray = GetTypedValFromDictEntry(pnode, 'Kids', OARRAY, s,
                    crossrefs)
                if kidsarray is None:
                    if WARN:
                        print('cannot find Kids in Pages')
                    return ''
                if len(kidsarray) == 0:
                    if WARN:
                        print('Kids array has no Page')
                    return ''
                firstkid = kidsarray[0]
                pnodeobj = GetPDFObjFromIndirectRef(firstkid, s, crossrefs)
                if not PDFObjHasType(pnodeobj, ODICT):
                    if WARN:
                        print('Kids element has unexpected type')
                    return ''
                # firstkid is known to be an indirect reference here, so its
                # value is the (obj#, gen#) pair
                if firstkid[1] in seen:
                    if WARN:
                        print('page tree contains a cycle')
                    return ''
                seen.add(firstkid[1])
                pnode = pnodeobj[1]
            elif pnodetype == 'Page':
                contentsobj = GetPDFObjFromDictEntry(pnode, 'Contents', s,
                    crossrefs)
                if contentsobj is None:
                    # it is legal for there to be no contents object:
                    # means empty page
                    if WARN:
                        print('First Page is empty')
                    return ''
                if contentsobj[0] == OSTREAM:
                    return GetPDFStreamContents(contentsobj, s, crossrefs)
                elif contentsobj[0] == OARRAY:
                    pieces = []
                    for c in contentsobj[1]:
                        if not PDFObjHasType(c, OINDIRECTREF):
                            if WARN:
                                print('Contents obj child not an indirect ref')
                            return ''
                        o = GetPDFObjFromIndirectRef(c, s, crossrefs)
                        if not PDFObjHasType(o, OSTREAM):
                            if WARN:
                                print('Contents obj child not a stream')
                            return ''
                        pieces.append(GetPDFStreamContents(o, s, crossrefs))
                    return '\n'.join(pieces)
                else:
                    if WARN:
                        print('Contents object has unexpected type',
                            contentsobj[0])
                    return ''
            else:
                if WARN:
                    print('Page tree node has unexpected type', pnodetype)
                return ''
        # shouldn't get here
        return ''
    
    
    def GetPDFObjFromIndirectRef(obj, s, crossrefs):
        """Look up the object that an indirect reference points to.

        Args:
          obj: (int, value) - should be (OINDIRECTREF, (obj_number, gen_number))
          s: string - contents of PDF file
          crossrefs: dict - maps (obj_number, gen_number) to crossref triple
        Returns:
          (objectid, value) - the referred value (taken out of its containing
                              OINDIRECTDEF when read from a file offset)
                              or None if there is any problem
        """

        if not PDFObjHasType(obj, OINDIRECTREF):
            return None
        entry = crossrefs.get(obj[1])
        if entry is None:
            return None

        (kind, where, extra) = entry
        if kind == XCOMPRESSED:
            # where = object stream number, extra = index inside that stream
            return GetPDFCompressedObject(s, where, extra, crossrefs)
        if kind != XUNCOMPRESSED:
            if WARN:
                print("Bad xref type")
            return None
        # where = byte offset of the definition in s
        if not 0 <= where < len(s):
            return None
        (found, _) = GetPDFObject(s, where)
        return found[1][2] if PDFObjHasType(found, OINDIRECTDEF) else None
    
    
    
    def GetPDFCompressedObject(s, strnum, oindex, crossrefs):
        """Get one complete object from compressed stream.

        The object stream starts with N pairs of ints (object number, offset
        relative to /First); the pair at position oindex tells where the
        wanted object begins.

        Args:
          s : bytes holding contents of a PDF file
          strnum: object number of object stream where object is
          oindex: index of object within the stream
          crossrefs: dict - maps (obj_number, gen_number) to crossref triple
        Returns:
          (objectid, value) - or None, if no such object
        """

        strkey = (strnum, 0)
        if strkey not in crossrefs:
            if WARN:
                print("could not find object", strnum, "in crossrefs")
            return None
        (g1, g2, g3) = crossrefs[strkey]
        if g1 != XUNCOMPRESSED:
            if WARN:
                print("stream object is not uncompressed", g1, g2, g3)
            return None
        if g2 < 0 or g2 >= len(s):
            return None
        (ostream, _) = GetPDFObject(s, g2)
        if PDFObjHasType(ostream, OINDIRECTDEF):
            ostream = ostream[1][2]
        if not PDFObjHasType(ostream, OSTREAM):
            if WARN:
                print("stream object does not have type stream")
            return None
        d = ostream[1][0]
        ty = GetTypedValFromDictEntry(d, "Type", ONAME, s, crossrefs)
        if ty != "ObjStm":
            if WARN:
                print("stream object does not have Type ObjStm")
            return None
        n = GetTypedValFromDictEntry(d, "N", ONUM, s, crossrefs)
        first = GetTypedValFromDictEntry(d, "First", ONUM, s, crossrefs)
        if not n or not first:
            if WARN:
                print("required n or first not in object stream")
            return None
        streamcont = GetPDFStreamContents(ostream, s, crossrefs, False)
        if not streamcont:
            if WARN:
                print("could not get contents of object stream", strnum)
            return None
        if not isinstance(streamcont, bytes):
            # the parsing functions use bytes patterns, so text contents
            # have to be turned back into bytes first
            streamcont = streamcont.encode('latin1', 'replace')
        i = 0
        off = None
        for count in range(n):
            (intpair, i) = GetPDFTwoInts(streamcont, i)
            if not intpair:
                if WARN:
                    print("stream object did not find int pair at count", count)
                return None
            if count == oindex:
                off = intpair[1]
                break
        if off is None:
            # oindex is not among the n objects of this stream
            return None
        # only the requested object is parsed, not the ones before it
        (obj, _) = GetPDFObject(streamcont, first + off)
        return obj
    
    
    
    def GetPDFObjFromDictEntry(d, entryname, s, crossrefs):
        """Fetch the PDF object stored under entryname in d.

        An indirect reference found there is resolved through crossrefs.
        Returns None when d has no such entry (or, for an indirect
        reference, when it cannot be resolved).
        """

        if entryname in d:
            entry = d[entryname]
            if PDFObjHasType(entry, OINDIRECTREF):
                return GetPDFObjFromIndirectRef(entry, s, crossrefs)
            return entry
        return None
    
    
    def GetTypedValFromDictEntry(d, entryname, ty, s, crossrefs):
        """Return the value part of d's entry if that object has type ty.

        Indirect references are resolved first.  None is returned when the
        entry is missing or the object found has another type.
        """

        found = GetPDFObjFromDictEntry(d, entryname, s, crossrefs)
        return found[1] if PDFObjHasType(found, ty) else None
    
    
    def PDFObjHasType(o, ty):
        """Tell whether PDF object o, an (objectid, value) pair, has type ty.

        None (no object) never has any type.
        """

        return o is not None and o[0] == ty
    
    
    def PDFDictType(d):
        """Give the name stored under 'Type' in d.

        Returns '' when d has no 'Type' entry or the entry is not a name.
        """

        if 'Type' not in d:
            return ''
        tyobj = d['Type']
        return tyobj[1] if PDFObjHasType(tyobj, ONAME) else ''
    
    
    
    def GetPDFStreamContents(contentsobj, s, crossrefs, dodecode=True):
        """Return the contents of a stream object, applying any needed filters.

        Only the FlateDecode filter is handled.  Of its DecodeParms only
        Predictor and Columns are looked at: predictors below 10 (other than
        1) are reported and ignored; for predictors of 10 and above every row
        is undone as a PNG "Up" row (a warning is printed for rows tagged
        with another type).

        Args:
          contentsobj: (OSTREAM, (dict, istart, iend))
          s: bytes - PDF file contents
          crossrefs: dict - maps (obj_number, gen_number) to crossref triple
          dodecode: bool - should we decode too?
        Returns:
          None if contentsobj is not a stream;
          '' if the stream has no usable Length, uses an unhandled filter,
          or cannot be inflated;
          otherwise the contents - a str decoded with the latin1 decoder if
          dodecode, else the raw bytes.  Exception: data that went through
          the PNG predictor is always a str of characters 0-255, whatever
          dodecode says.
        Raises:
          RuntimeError: a FlateDecode stream is met but zlib is not available
        """

        if not PDFObjHasType(contentsobj, OSTREAM):
            return None
        (d, istart, _) = contentsobj[1]
        length = GetTypedValFromDictEntry(d, 'Length', ONUM, s, crossrefs)
        if length is None:
            return ''
        ans = s[istart:istart + length]
        filterobj = GetPDFObjFromDictEntry(d, 'Filter', s, crossrefs)
        filters = []
        if PDFObjHasType(filterobj, ONAME):
            filters = [filterobj[1]]
        elif PDFObjHasType(filterobj, OARRAY):
            for o in filterobj[1]:
                if PDFObjHasType(o, ONAME):
                    filters.append(o[1])
        for fname in filters:
            if fname != 'FlateDecode':
                if WARN:
                    print('unhandled stream filter', fname)
                return ''
            if not zlib:
                raise RuntimeError("pdf decoding requires missing zlib module")
            try:
                ans = zlib.decompress(ans)
            except zlib.error as err:
                if WARN:
                    print('cannot inflate stream:', err)
                return ''
            parms = GetTypedValFromDictEntry(d, 'DecodeParms', ODICT, s, crossrefs)
            needPngPredictor = False
            columns = 1
            if parms is not None:
                predictor = GetTypedValFromDictEntry(parms, 'Predictor', ONUM, s, crossrefs)
                columns = GetTypedValFromDictEntry(parms, 'Columns', ONUM, s, crossrefs)
                if predictor is not None:
                    if predictor == 1:
                        pass
                    elif predictor < 10:
                        if WARN:
                            print('unhandled predictor type', predictor)
                    else:
                        if columns is None:
                            columns = 1
                        needPngPredictor = True
            if needPngPredictor:
                # Each row is one tag byte followed by 'columns' data bytes;
                # every data byte is added to the byte above it (mod 256).
                ansbytes = []
                col1 = columns + 1
                k = 0
                currow = [0] * columns
                while k + col1 <= len(ans):
                    rowtype = ordat(ans, k)
                    if rowtype != 2:
                        if WARN:
                            print('unhandled PNG predictor type: ', rowtype)
                    k += 1
                    for j in range(0, columns):
                        currow[j] = (currow[j] + ordat(ans, k + j)) & 0xFF
                        ansbytes.append(chr(currow[j]))
                    k += columns
                if k != len(ans):
                    if WARN:
                        print("FlateDecode with prediction didn't consume all bytes")
                ans = ''.join(ansbytes)
        # Decode once, after all filters; predictor output may already be text.
        if dodecode and isinstance(ans, bytes):
            ans = ans.decode('latin1')
        return ans
    
    
    # Script use: given exactly one argument (a PDF file name), write the
    # contents of that file's first page to standard output.
    if __name__ == "__main__" and len(sys.argv) == 2:
        sys.stdout.write(ReadPDFPageOneContents(sys.argv[1]))