move markup parsing to utils.py with a test script

2019-10-16 16:54:45 -07:00
parent 7803ae2fb1
commit 6be415a1df
4 changed files with 173 additions and 0 deletions
--- a/markup/utils.py
+++ b/markup/utils.py
@ -0,0 +1,133 @@
+#from __future__ import absolute_import, unicode_literals
+
+import sys, os.path, re, json, pickle, subprocess
+import shutil
+
+#from pprint import pprint
+#import dumper
+
+#from pdfminer.psparser import PSKeyword, PSLiteral, LIT
+from pdfminer.pdfparser import PDFParser
+from pdfminer.pdfdocument import PDFDocument #, PDFNoOutlines
+#from pdfminer.pdftypes import PDFObjectNotFound, PDFValueError, PDFNotImplementedError
+#from pdfminer.pdftypes import dict_value, num_value, list_value
+#from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1, resolve_all, stream_value
+from pdfminer.pdftypes import PDFObjRef, resolve1
+#from pdfminer.pdfpage import PDFPage
+#from pdfminer.utils import isnumber
+#from pdfminer.image import ImageWriter
+
+from django.conf import settings
+
+# Directory under settings.ASSET_DIR where export_jp2() writes the extracted
+# .jp2 images and their .png conversions.
+WORKDIR = os.path.join(settings.ASSET_DIR, 'markup', 'work')
+
+
+def make_product_box(obj, pagenum):
+    """Build a product-box record from a ProCat annotation dictionary.
+
+    Args:
+        obj: Annotation dictionary. The entries 'ProCatName',
+            'ProCatMaterialNumber', 'ProCatColor' and 'ProCatGender' must be
+            present and hold byte strings; 'Rect' holds the position.
+        pagenum: Page number to store in the record under 'page'.
+
+    Returns:
+        A dict with the keys 'material', 'name', 'color', 'gender', 'rect'
+        and 'page', or None when 'Rect' is missing or empty.
+
+    Raises:
+        KeyError: if one of the ProCat* entries is missing.
+    """
+    name = obj['ProCatName'].decode()
+    material = obj['ProCatMaterialNumber'].decode()
+    color = obj['ProCatColor'].decode()
+    gender = obj['ProCatGender'].decode()
+    # .get() so that an annotation lacking 'Rect' altogether is reported and
+    # skipped the same way as one with an empty 'Rect'.
+    rect = obj.get('Rect')
+
+    if not rect:
+        # Dump with repr(): the third-party dumper module is not imported in
+        # this module, so calling it here would raise NameError.
+        print('Annotation without rect:')
+        print(repr(obj))
+        return None
+
+    return {
+        'material': material,
+        'name': name,
+        'color': color,
+        'gender': gender,
+        'rect': rect,
+        'page': pagenum,
+    }
+
+
+def make_scribble(obj, pagenum):
+    """Describe a scribble annotation and export its JPEG 2000 image.
+
+    Follows the annotation's 'AP' -> 'N' -> 'Resources' -> 'XObject' -> 'Im1'
+    chain to the image stream. When that stream's 'Filter' is named
+    'JPXDecode' the image is written out via export_jp2().
+
+    Args:
+        obj: Annotation dictionary containing 'Rect' and 'AP'.
+        pagenum: Page number to store in the record under 'page'.
+
+    Returns:
+        A dict with the keys 'page', 'rect' and 'objid' (the image stream's
+        object id), or None when there is no usable normal appearance or the
+        image is not JPXDecode-encoded.
+    """
+    position = obj['Rect']
+
+    # The normal appearance is expected to be an indirect reference here;
+    # a falsy value or a non-positive object id means there is nothing to follow.
+    ap_dict = resolve1(obj['AP'])
+    normal_ref = ap_dict['N']
+    if not normal_ref or normal_ref.objid <= 0:
+        print('skipping scribble - no normal appearance')
+        return None
+
+    # Walk the rest of the object tree down to the image stream.
+    normal_dict = resolve1(normal_ref)
+    resource_dict = resolve1(normal_dict['Resources'])
+    xobject_dict = resolve1(resource_dict['XObject'])
+    image = resolve1(xobject_dict['Im1'])
+
+    if image['Filter'].name != 'JPXDecode':
+        print('skipping non-jp2 image')
+        return None
+
+    export_jp2(image)
+    return {'page': pagenum, 'rect': position, 'objid': image.objid}
+
+
+def export_jp2(obj):
+    """Write an image stream to WORKDIR as .jp2 and convert it to .png.
+
+    The raw stream data is saved as 'export-<objid>.jp2' and then converted
+    to 'export-<objid>.png' by running the external 'opj_decompress' command.
+    WORKDIR is created if it does not exist. The directory (mode 0o775) and
+    both files (mode 0o664) are assigned to the 'procat' group. A failed
+    conversion is reported on stdout, not raised.
+
+    Args:
+        obj: Stream object providing 'objid' and 'get_rawdata()'.
+    """
+    def share_with_group(path, mode):
+        # Apply the permission bits, then hand the path to the procat group.
+        os.chmod(path, mode)
+        shutil.chown(path, group='procat')
+
+    jp2_path, png_path = (
+        os.path.join(WORKDIR, template.format(obj.objid))
+        for template in ("export-{}.jp2", "export-{}.png")
+    )
+
+    if not os.path.exists(WORKDIR):
+        os.makedirs(WORKDIR)
+        share_with_group(WORKDIR, 0o775)
+
+    raw = obj.get_rawdata()
+    print('extracting jp2: {}'.format(jp2_path))
+    with open(jp2_path, 'wb') as jp2_file:
+        jp2_file.write(raw)
+        share_with_group(jp2_path, 0o664)
+
+    command = ['opj_decompress', '-i', jp2_path, '-o', png_path]
+    completed = subprocess.run(command, capture_output=True)
+    if completed.returncode == 0:
+        # Only a successful conversion leaves a png to fix up.
+        share_with_group(png_path, 0o664)
+        return
+
+    print('ERROR converting {}:\n{}\n{}'.format(jp2_path, completed.stdout.decode(), completed.stderr.decode()))
+
+
+def parse_pdf(fname, debug=0):
+    """Collect product boxes and scribbles from the annotations of a PDF.
+
+    Iterates over the 'Kids' of the document catalog's 'Pages' entry and
+    inspects each page's 'Annots'. Annotations containing 'AAPL:AKExtras' are
+    passed to make_scribble(), those containing 'ProCatName' to
+    make_product_box(); anything else is reported and ignored.
+
+    Args:
+        fname: Path of the PDF file to read.
+        debug: Value assigned to the class-level PDFDocument.debug and
+            PDFParser.debug attributes.
+
+    Returns:
+        A two-element list [prod_boxes, scribbles] of record dicts.
+        Annotations that the make_* helpers skip (they return None) are left
+        out, so neither list contains None entries.
+    """
+    PDFDocument.debug = debug
+    PDFParser.debug = debug
+
+    prod_boxes = []
+    scribbles = []
+
+    # Keep the file open for the whole walk (objects are resolved while
+    # iterating) and let the context manager close it even if parsing raises.
+    with open(fname, 'rb') as fp:
+        doc = PDFDocument(PDFParser(fp))
+
+        page_dict = resolve1(doc.catalog['Pages'])
+        pages = resolve1(page_dict['Kids'])
+        # NOTE(review): only the direct 'Kids' are visited; nested page-tree
+        # nodes are not descended into - confirm this is sufficient.
+        for pagenum, page in enumerate(pages, 1):
+            page = resolve1(page)
+            if 'Annots' not in page:
+                continue
+
+            annots = page['Annots']
+            if isinstance(annots, PDFObjRef):
+                annots = resolve1(annots)
+
+            for anno in annots:
+                anno = resolve1(anno)
+                if 'AAPL:AKExtras' in anno:
+                    scribble = make_scribble(anno, pagenum)
+                    if scribble is not None:
+                        scribbles.append(scribble)
+                elif 'ProCatName' in anno:
+                    box = make_product_box(anno, pagenum)
+                    if box is not None:
+                        prod_boxes.append(box)
+                else:
+                    print('ignoring other annotation')
+
+    return [prod_boxes, scribbles]
--- a/markup/work/__init__.py
+++ b/markup/work/__init__.py
--- a/markup/work/test_pdf.py
+++ b/markup/work/test_pdf.py
@ -0,0 +1,38 @@
+#!/usr/bin/env python3
+import sys
+import os
+import inspect
+
+# Put the directory two levels above this script on sys.path so the
+# 'markup' and 'procat2' imports below resolve when the script is run
+# directly (presumably that directory is the project root - confirm).
+currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
+parentdir = os.path.dirname(currentdir)
+parentparentdir = os.path.dirname(parentdir)
+sys.path.insert(0, parentparentdir) 
+
+import dumper
+import getopt
+import django
+from django.conf import settings
+
+# Select the settings module and initialise Django before importing
+# markup.utils, which reads settings.ASSET_DIR at import time.
+os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'procat2.settings')
+django.setup()
+
+from markup.utils import parse_pdf
+from procat2.settings import ASSET_DIR
+
+
+def main(argv):
+    """Command-line entry point: run parse_pdf() on every file given.
+
+    Args:
+        argv: Argument list including the program name at index 0. Each
+            '-d' option raises the debug level passed to parse_pdf() by one.
+
+    Returns:
+        100 after printing the usage line when the options are invalid or no
+        file is given; otherwise None.
+    """
+    def usage():
+        print('usage: %s [-d] file ...' % argv[0])
+        return 100
+
+    try:
+        opts, args = getopt.getopt(argv[1:], 'd')
+    except getopt.GetoptError:
+        return usage()
+    if not args:
+        return usage()
+
+    debug = sum(1 for flag, _ in opts if flag == '-d')
+
+    # The usage line promises 'file ...', so handle every file argument
+    # rather than only the first one.
+    for fname in args:
+        parse_pdf(fname, debug)
+
+if __name__ == '__main__':
+    # Use main()'s return value as the process exit status.
+    sys.exit(main(sys.argv))