"""Extract product boxes and scribble images from annotated PDF files.

``parse_pdf`` walks the annotations of every page and sorts them into
"product boxes" (annotations carrying ``ProCat*`` keys) and "scribbles"
(annotations carrying an ``AAPL:AKExtras`` key whose appearance stream
embeds a JPEG 2000 image). Scribble images are written to ``WORKDIR`` and
converted to PNG with the external ``opj_decompress`` tool.
"""
import json
import os.path
import pickle
import re
import shutil
import subprocess
import sys
from pprint import pformat

from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdftypes import PDFObjRef, resolve1

from django.conf import settings

# Directory that receives the exported .jp2 / .png scribble images.
WORKDIR = os.path.join(settings.ASSET_DIR, 'markup', 'work')


def make_product_box(obj, pagenum):
    """Build a product-box record from a resolved annotation dictionary.

    Args:
        obj: Annotation dictionary containing the ``ProCatName``,
            ``ProCatMaterialNumber``, ``ProCatColor`` and ``ProCatGender``
            entries (values must support ``.decode()``), and optionally
            ``Rect``.
        pagenum: 1-based number of the page holding the annotation.

    Returns:
        A dict with the keys ``material``, ``name``, ``color``, ``gender``,
        ``rect`` and ``page``, or ``None`` when the annotation has no
        (or an empty) ``Rect``.

    Raises:
        KeyError: If one of the ``ProCat*`` entries is missing.
    """
    name = obj['ProCatName'].decode()
    material = obj['ProCatMaterialNumber'].decode()
    color = obj['ProCatColor'].decode()
    gender = obj['ProCatGender'].decode()

    # .get() so a missing Rect reaches the diagnostic branch below instead
    # of raising KeyError.
    rect = obj.get('Rect')
    if not rect:
        print('Annotation without rect:')
        # The original called dumper.dump(), but that module is not
        # imported; pformat gives an equivalent readable dump.
        print(pformat(obj))
        return None

    return {
        'material': material,
        'name': name,
        'color': color,
        'gender': gender,
        'rect': rect,
        'page': pagenum,
    }


def make_scribble(obj, pagenum):
    """Export the image behind a scribble annotation and describe it.

    Follows ``AP`` -> ``N`` -> ``Resources`` -> ``XObject`` -> ``Im1`` to
    reach the image stream. Only streams whose filter is ``JPXDecode`` are
    exported (via ``export_jp2``).

    Args:
        obj: Resolved annotation dictionary with ``Rect`` and ``AP`` entries.
        pagenum: 1-based number of the page holding the annotation.

    Returns:
        A dict with the keys ``page``, ``rect`` and ``objid`` (object id of
        the image stream), or ``None`` when the annotation has no usable
        normal appearance or the image is not JPEG 2000.
    """
    rect = obj['Rect']  # position of the annotation

    # Walk the object tree down to the image.
    appearance = resolve1(obj['AP'])
    normal_ref = appearance['N']
    if not normal_ref or normal_ref.objid <= 0:
        print('skipping scribble - no normal appearance')
        return None

    normal_appearance = resolve1(normal_ref)
    resources = resolve1(normal_appearance['Resources'])
    xobjects = resolve1(resources['XObject'])
    image = resolve1(xobjects['Im1'])  # stream object of the image

    if image['Filter'].name != 'JPXDecode':
        print('skipping non-jp2 image')
        return None

    export_jp2(image)
    return {
        'page': pagenum,
        'rect': rect,
        'objid': image.objid,
    }


def export_jp2(obj):
    """Write a JPEG 2000 stream to ``WORKDIR`` and convert it to PNG.

    The files are named ``export-<objid>.jp2`` and ``export-<objid>.png``.
    ``WORKDIR`` is created on demand. The directory and both files get
    group-writable permissions and are assigned to the ``procat`` group.
    A failed conversion is reported on stdout and does not raise.

    Args:
        obj: Stream object providing ``objid`` and ``get_rawdata()``.
    """
    jp2_path = os.path.join(WORKDIR, "export-{}.jp2".format(obj.objid))
    png_path = os.path.join(WORKDIR, "export-{}.png".format(obj.objid))

    if not os.path.exists(WORKDIR):
        os.makedirs(WORKDIR)
        os.chmod(WORKDIR, 0o775)
        shutil.chown(WORKDIR, group='procat')

    data = obj.get_rawdata()
    print('extracting jp2: {}'.format(jp2_path))
    with open(jp2_path, 'wb') as out:
        out.write(data)
    os.chmod(jp2_path, 0o664)
    shutil.chown(jp2_path, group='procat')

    result = subprocess.run(
        ['opj_decompress', '-i', jp2_path, '-o', png_path],
        capture_output=True,
    )
    if result.returncode != 0:
        print('ERROR converting {}:\n{}\n{}'.format(
            jp2_path, result.stdout.decode(), result.stderr.decode()))
    else:
        os.chmod(png_path, 0o664)
        shutil.chown(png_path, group='procat')


def parse_pdf(fname, debug=0):
    """Collect product boxes and scribbles from the annotations of a PDF.

    Args:
        fname: Path of the PDF file to read.
        debug: Value assigned to the ``debug`` attribute of ``PDFDocument``
            and ``PDFParser``.

    Returns:
        A two-element list ``[prod_boxes, scribbles]``. Annotations that the
        helpers reject (they return ``None``) are left out of both lists.
    """
    PDFDocument.debug = debug
    PDFParser.debug = debug

    prod_boxes = []
    scribbles = []

    # The context manager closes the file even when parsing raises; every
    # resolve1() call happens inside it because objects are read from the
    # open file.
    with open(fname, 'rb') as fp:
        doc = PDFDocument(PDFParser(fp))
        page_dict = resolve1(doc.catalog['Pages'])
        pages = resolve1(page_dict['Kids'])

        # NOTE: only the direct Kids of the root Pages node are visited.
        for pagenum, page_ref in enumerate(pages, start=1):
            page = resolve1(page_ref)
            if 'Annots' not in page:
                continue

            # resolve1() returns non-reference values unchanged, so this
            # covers both inline and indirect annotation arrays.
            annots = resolve1(page['Annots'])
            for anno_ref in annots:
                anno = resolve1(anno_ref)
                if 'AAPL:AKExtras' in anno:
                    scribble = make_scribble(anno, pagenum)
                    if scribble is not None:
                        scribbles.append(scribble)
                elif 'ProCatName' in anno:
                    box = make_product_box(anno, pagenum)
                    if box is not None:
                        prod_boxes.append(box)
                else:
                    print('ignoring other annotation')

    return [prod_boxes, scribbles]