add scribble image processing

2019-10-16 23:55:42 -07:00
parent 6be415a1df
commit 68a658dfe8
5 changed files with 157 additions and 6 deletions
--- a/markup/utils.py
+++ b/markup/utils.py
@@ -1,133 +0,0 @@
-#from __future__ import absolute_import, unicode_literals
-
-import sys, os.path, re, json, pickle, subprocess
-import shutil
-
-#from pprint import pprint
-#import dumper
-
-#from pdfminer.psparser import PSKeyword, PSLiteral, LIT
-from pdfminer.pdfparser import PDFParser
-from pdfminer.pdfdocument import PDFDocument #, PDFNoOutlines
-#from pdfminer.pdftypes import PDFObjectNotFound, PDFValueError, PDFNotImplementedError
-#from pdfminer.pdftypes import dict_value, num_value, list_value
-#from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1, resolve_all, stream_value
-from pdfminer.pdftypes import PDFObjRef, resolve1
-#from pdfminer.pdfpage import PDFPage
-#from pdfminer.utils import isnumber
-#from pdfminer.image import ImageWriter
-
-from django.conf import settings
-
-# Scratch directory under the configured asset directory; export_jp2 writes
-# the extracted .jp2 files and their .png conversions here.
-WORKDIR = os.path.join(settings.ASSET_DIR, 'markup', 'work')
-
-
-def make_product_box(obj, pagenum):
-    """Build a product-box record from a ProCat annotation dictionary.
-
-    Args:
-        obj: Annotation dictionary. The ``ProCatName``,
-            ``ProCatMaterialNumber``, ``ProCatColor`` and ``ProCatGender``
-            entries must be bytes; they are decoded with ``bytes.decode()``.
-        pagenum: Page number the annotation was found on.
-
-    Returns:
-        A dict with the keys ``material``, ``name``, ``color``, ``gender``,
-        ``rect`` and ``page``, or ``None`` (after printing the annotation)
-        when the ``Rect`` entry is missing or empty.
-
-    Raises:
-        KeyError: If one of the ``ProCat*`` entries is missing.
-    """
-    name = obj['ProCatName'].decode()
-    material = obj['ProCatMaterialNumber'].decode()
-    color = obj['ProCatColor'].decode()
-    gender = obj['ProCatGender'].decode()
-    # .get() so a missing Rect takes the same "no rect" path as an empty one
-    # instead of raising KeyError.
-    rect = obj.get('Rect')
-
-    if not rect:
-        print('Annotation without rect:')
-        # This used to call dumper.dump(obj), but the `dumper` import is
-        # commented out in this module, so the call raised NameError.
-        print(obj)
-        return None
-
-    return {
-        'material': material,
-        'name': name,
-        'color': color,
-        'gender': gender,
-        'rect': rect,
-        'page': pagenum,
-    }
-
-
-def make_scribble(obj, pagenum):
-    """Export the image behind a scribble annotation and describe it.
-
-    Walks ``AP`` -> ``N`` -> ``Resources`` -> ``XObject`` -> ``Im1`` from the
-    annotation dictionary. If that image stream uses the ``JPXDecode``
-    filter it is passed to ``export_jp2``.
-
-    Args:
-        obj: Resolved annotation dictionary.
-        pagenum: Page number the annotation was found on.
-
-    Returns:
-        A dict with the keys ``page``, ``rect`` and ``objid`` (the object id
-        of the image stream), or ``None`` when the annotation is skipped.
-        Each skip prints a short reason.
-
-    Raises:
-        KeyError: If the annotation has no ``Rect`` entry.
-    """
-    rect = obj['Rect']  # position
-
-    # walk the object tree down to the image
-    appearance = resolve1(obj.get('AP'))
-    normal_ref = appearance.get('N') if appearance else None
-    # getattr: an N entry that is not an indirect reference has no objid;
-    # treat it like a missing appearance instead of raising AttributeError.
-    if not normal_ref or getattr(normal_ref, 'objid', 0) <= 0:
-        print('skipping scribble - no normal appearance')
-        return None
-
-    try:
-        normal_appearance = resolve1(normal_ref)
-        resources = resolve1(normal_appearance['Resources'])
-        xobj = resolve1(resources['XObject'])
-        im1 = resolve1(xobj['Im1'])  # PDFStream of the image
-        flter = resolve1(im1.get('Filter'))
-    except (AttributeError, KeyError, TypeError):
-        # A link in the chain is missing or is not the expected container;
-        # skip this annotation rather than abort the caller's whole parse.
-        print('skipping scribble - no Im1 image')
-        return None
-
-    # A missing Filter, or one that is not a single name (e.g. an array of
-    # filters), has no usable .name and is treated as "not jp2".
-    if getattr(flter, 'name', None) != 'JPXDecode':
-        print('skipping non-jp2 image')
-        return None
-
-    export_jp2(im1)
-    return {
-        'page': pagenum,
-        'rect': rect,
-        'objid': im1.objid,
-    }
-
-
-def export_jp2(obj):
-    """Write a JPEG 2000 stream into WORKDIR and convert it to PNG.
-
-    The raw stream data goes to ``export-<objid>.jp2``; ``opj_decompress``
-    is then run to produce ``export-<objid>.png``. WORKDIR (when it has to
-    be created) and both files get group-writable permissions and the
-    ``procat`` group. A failed conversion is only reported via ``print``.
-
-    Args:
-        obj: Stream object providing ``objid`` and ``get_rawdata()``.
-    """
-    def hand_to_group(path, mode):
-        # Same permission/group treatment for the directory and both files.
-        os.chmod(path, mode)
-        shutil.chown(path, group='procat')
-
-    source = os.path.join(WORKDIR, "export-{}.jp2".format(obj.objid))
-    target = os.path.join(WORKDIR, "export-{}.png".format(obj.objid))
-
-    if not os.path.exists(WORKDIR):
-        os.makedirs(WORKDIR)
-        hand_to_group(WORKDIR, 0o775)
-
-    raw = obj.get_rawdata()
-    print('extracting jp2: {}'.format(source))
-    with open(source, 'wb') as handle:
-        handle.write(raw)
-        hand_to_group(source, 0o664)
-
-    proc = subprocess.run(
-        ['opj_decompress', '-i', source, '-o', target],
-        stdout=subprocess.PIPE,
-        stderr=subprocess.PIPE,
-    )
-    if proc.returncode == 0:
-        hand_to_group(target, 0o664)
-        return
-
-    print('ERROR converting {}:\n{}\n{}'.format(source, proc.stdout.decode(), proc.stderr.decode()))
-
-
-def parse_pdf(fname, debug=0):
-    """Collect product boxes and scribbles from a PDF's page annotations.
-
-    Annotations containing ``AAPL:AKExtras`` are passed to ``make_scribble``
-    and annotations containing ``ProCatName`` to ``make_product_box``; any
-    other annotation is reported and ignored.
-
-    Args:
-        fname: Path of the PDF file to read.
-        debug: Value assigned to the class-level ``debug`` attribute of
-            ``PDFDocument`` and ``PDFParser``.
-
-    Returns:
-        ``[prod_boxes, scribbles]``, two lists of record dicts. Annotations
-        for which the builder returned ``None`` are left out.
-    """
-    PDFDocument.debug = debug
-    PDFParser.debug = debug
-
-    prod_boxes = []
-    scribbles = []
-
-    # `with` closes the file even when parsing raises part-way through.
-    with open(fname, 'rb') as fp:
-        doc = PDFDocument(PDFParser(fp))
-
-        page_dict = resolve1(doc.catalog['Pages'])
-        # NOTE(review): only the root node's direct Kids are visited; a
-        # nested page tree would need recursion - confirm against real input.
-        pages = resolve1(page_dict['Kids'])
-        for pagenum, page in enumerate(pages, start=1):
-            page = resolve1(page)
-            if 'Annots' not in page:
-                continue
-
-            # resolve1 returns non-reference values unchanged, so no
-            # isinstance check is needed here.
-            for anno in resolve1(page['Annots']):
-                anno = resolve1(anno)
-                if 'AAPL:AKExtras' in anno:
-                    scribble = make_scribble(anno, pagenum)
-                    # Both builders return None for annotations they skip;
-                    # keep those out of the result lists.
-                    if scribble is not None:
-                        scribbles.append(scribble)
-                elif 'ProCatName' in anno:
-                    box = make_product_box(anno, pagenum)
-                    if box is not None:
-                        prod_boxes.append(box)
-                else:
-                    print('ignoring other annotation')
-
-    return [prod_boxes, scribbles]