markup: matching works

2019-10-18 13:11:53 -07:00
parent be2902ca24
commit 5282f7cb2f
4 changed files with 169 additions and 32 deletions
--- a/markup/pdf.py
+++ b/markup/pdf.py
@@ -1,28 +1,20 @@
-#from __future__ import absolute_import, unicode_literals
-
-import sys, os.path, re, json, pickle, subprocess
+import os
+import sys
+import subprocess
 import shutil

-#from pprint import pprint
-#import dumper
-
-#from pdfminer.psparser import PSKeyword, PSLiteral, LIT
 from pdfminer.pdfparser import PDFParser
-from pdfminer.pdfdocument import PDFDocument #, PDFNoOutlines
-#from pdfminer.pdftypes import PDFObjectNotFound, PDFValueError, PDFNotImplementedError
-#from pdfminer.pdftypes import dict_value, num_value, list_value
-#from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1, resolve_all, stream_value
+from pdfminer.pdfdocument import PDFDocument
 from pdfminer.pdftypes import PDFObjRef, resolve1
-#from pdfminer.pdfpage import PDFPage
-#from pdfminer.utils import isnumber
-#from pdfminer.image import ImageWriter

 from django.conf import settings

+from .utils import pdf_rect
+
 WORKDIR = os.path.join(settings.ASSET_DIR, 'markup', 'work')


-def make_product_box(obj, pagenum):
+def make_product_box(obj, pagenum, mediabox):
    name = obj['ProCatName'].decode()
    material = obj['ProCatMaterialNumber'].decode()
    color = obj['ProCatColor'].decode()
@@ -34,7 +26,7 @@ def make_product_box(obj, pagenum):
                 'name': name,
                 'color': color,
                 'gender': gender,
-                 'rect': rect,
+                 'rect': pdf_rect(rect, mediabox[3]),
                 'page': pagenum }
    else:
        print('Annotation without rect:')
@@ -42,30 +34,28 @@ def make_product_box(obj, pagenum):
        return None


-def make_scribble(obj, pagenum):
-    rect = obj['Rect'] # position
-    #print(obj)
+def make_scribble(obj, pagenum, mediabox):
+    rect = obj['Rect'] # position on page

    # walk the object tree down to the image
    appearance = resolve1(obj['AP'])
-    #print('app', appearance)
    normal_appearance = appearance['N']
    if not normal_appearance or normal_appearance.objid <= 0:
        print('skipping scribble - no normal appearance')
        return

    normal_appearance = resolve1(normal_appearance)
-    #print('norm app', normal_appearance)
    resources = resolve1(normal_appearance['Resources'])
    xobj = resolve1(resources['XObject'])
    im1 = resolve1(xobj['Im1']) # PDFStream of the image

    flter = im1['Filter']
    if flter.name == 'JPXDecode':
-        export_jp2(im1)
+        path = export_jp2(im1)
        return { 'page': pagenum,
-                 'rect': rect,
-                 'objid': im1.objid }
+                 'rect': pdf_rect(rect, mediabox[3]),
+                 'objid': im1.objid,
+                 'image': path }
    else:
        print('skipping non-jp2 image')
        return None
@@ -94,9 +84,10 @@ def export_jp2(obj):
        os.chmod(png_path, 0o664)
        shutil.chown(png_path, group='procat')

+    return png_path
+

 def parse_pdf(fname, debug=0):
-
    PDFDocument.debug = debug
    PDFParser.debug = debug

@@ -115,6 +106,11 @@ def parse_pdf(fname, debug=0):
        page = resolve1(page)
        if not 'Annots' in page: continue

+        mediabox = page['MediaBox']
+        # if 'CropBox' in page:
+        #     cropbox = page['CropBox']
+        #     print('crop',cropbox)
+
        annots = page['Annots']
        if isinstance(annots, PDFObjRef):
            annots = resolve1(annots)
@@ -122,9 +118,9 @@ def parse_pdf(fname, debug=0):
        for anno in annots:
            anno = resolve1(anno)
            if 'AAPL:AKExtras' in anno:
-                scribbles.append(make_scribble(anno, pagenum))
+                scribbles.append(make_scribble(anno, pagenum, mediabox))
            elif 'ProCatName' in anno:
-                prod_boxes.append(make_product_box(anno, pagenum))
+                prod_boxes.append(make_product_box(anno, pagenum, mediabox))
            else:
                print('ignoring other annotation')