move markup parsing to utils.py with a test script

This commit is contained in:
2019-10-16 16:54:45 -07:00
parent 7803ae2fb1
commit 6be415a1df
4 changed files with 173 additions and 0 deletions

133
markup/utils.py Normal file
View File

@ -0,0 +1,133 @@
#from __future__ import absolute_import, unicode_literals
import sys, os.path, re, json, pickle, subprocess
import shutil
#from pprint import pprint
#import dumper
#from pdfminer.psparser import PSKeyword, PSLiteral, LIT
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument #, PDFNoOutlines
#from pdfminer.pdftypes import PDFObjectNotFound, PDFValueError, PDFNotImplementedError
#from pdfminer.pdftypes import dict_value, num_value, list_value
#from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1, resolve_all, stream_value
from pdfminer.pdftypes import PDFObjRef, resolve1
#from pdfminer.pdfpage import PDFPage
#from pdfminer.utils import isnumber
#from pdfminer.image import ImageWriter
from django.conf import settings
# Directory <ASSET_DIR>/markup/work; export_jp2() writes the extracted
# .jp2 files and their converted .png files here.
WORKDIR = os.path.join(settings.ASSET_DIR, 'markup', 'work')
def make_product_box(obj, pagenum):
    """Build a product-box record from a ProCat annotation dictionary.

    Args:
        obj: Annotation dictionary. It must contain the entries
            'ProCatName', 'ProCatMaterialNumber', 'ProCatColor' and
            'ProCatGender', each of which is decoded with ``.decode()``.
            'Rect' is optional.
        pagenum: Page number to record in the result.

    Returns:
        A dict with the keys 'material', 'name', 'color', 'gender', 'rect'
        and 'page', or None when the annotation has no usable 'Rect'.

    Raises:
        KeyError: If one of the required ProCat* entries is missing.
    """
    name = obj['ProCatName'].decode()
    material = obj['ProCatMaterialNumber'].decode()
    color = obj['ProCatColor'].decode()
    gender = obj['ProCatGender'].decode()

    # .get() so that a missing 'Rect' reaches the diagnostic branch below
    # instead of raising KeyError.
    rect = obj.get('Rect')
    if not rect:
        print('Annotation without rect:')
        # The `dumper` module is not imported in this file, so the previous
        # dumper.dump(obj) call raised NameError; print the repr instead.
        print(repr(obj))
        return None

    return {
        'material': material,
        'name': name,
        'color': color,
        'gender': gender,
        'rect': rect,
        'page': pagenum,
    }
def make_scribble(obj, pagenum):
    """Export the image behind a scribble annotation and describe it.

    Follows the annotation's 'AP' -> 'N' -> 'Resources' -> 'XObject' ->
    'Im1' chain. When that image stream's only filter is JPXDecode, the
    stream is handed to export_jp2() and a record describing it is returned.

    Args:
        obj: Annotation dictionary containing at least 'Rect' and 'AP'.
        pagenum: Page number to record in the result.

    Returns:
        A dict with the keys 'page', 'rect' and 'objid' (the object id of
        the image stream), or None when the annotation is skipped.

    Raises:
        KeyError: If 'Rect', 'AP', 'N', 'Resources', 'XObject' or 'Im1' is
            missing along the chain.
    """
    rect = obj['Rect']  # position

    # Walk the object tree down to the image.
    appearance = resolve1(obj['AP'])
    normal_appearance = appearance['N']
    if not normal_appearance or normal_appearance.objid <= 0:
        print('skipping scribble - no normal appearance')
        return None
    normal_appearance = resolve1(normal_appearance)
    resources = resolve1(normal_appearance['Resources'])
    xobj = resolve1(resources['XObject'])
    im1 = resolve1(xobj['Im1'])  # PDFStream of the image

    # 'Filter' may be absent, a single name, or a list of names. Only a
    # stream whose sole filter is JPXDecode has raw data that is a JPEG 2000
    # file; anything else is skipped rather than crashing on `.name`.
    flter = resolve1(im1.get('Filter'))
    filters = flter if isinstance(flter, list) else [flter]
    filter_names = [getattr(resolve1(f), 'name', None) for f in filters]
    if filter_names != ['JPXDecode']:
        print('skipping non-jp2 image')
        return None

    export_jp2(im1)
    return {
        'page': pagenum,
        'rect': rect,
        'objid': im1.objid,
    }
def _share_with_procat(path, mode):
    """Set the permission bits of *path* to *mode* and its group to 'procat'."""
    os.chmod(path, mode)
    shutil.chown(path, group='procat')


def export_jp2(obj):
    """Write a stream's raw data to WORKDIR as .jp2 and convert it to .png.

    The files are named ``export-<objid>.jp2`` and ``export-<objid>.png``.
    WORKDIR is created on first use. The directory and both files are made
    group-accessible for the 'procat' group. Conversion is done by running
    the external ``opj_decompress`` program; a conversion failure is
    reported with print() and does not raise.

    Args:
        obj: Stream object providing ``objid`` and ``get_rawdata()``.

    Returns:
        None.
    """
    jp2_path = os.path.join(WORKDIR, "export-{}.jp2".format(obj.objid))
    png_path = os.path.join(WORKDIR, "export-{}.png".format(obj.objid))

    # Create-then-handle instead of exists()-then-create, so two concurrent
    # callers cannot race; permissions are only set by whoever created it.
    try:
        os.makedirs(WORKDIR)
    except FileExistsError:
        pass
    else:
        _share_with_procat(WORKDIR, 0o775)

    data = obj.get_rawdata()
    print('extracting jp2: {}'.format(jp2_path))
    with open(jp2_path, 'wb') as out:
        out.write(data)
    _share_with_procat(jp2_path, 0o664)

    try:
        result = subprocess.run(
            ['opj_decompress', '-i', jp2_path, '-o', png_path],
            capture_output=True,
        )
    except OSError as err:
        # For example, the opj_decompress binary is not installed. Report it
        # the same way as a failed conversion instead of aborting the caller.
        print('ERROR converting {}:\n{}'.format(jp2_path, err))
        return

    if result.returncode != 0:
        # errors='replace' so odd bytes in the tool output cannot raise
        # while the failure is being reported.
        print('ERROR converting {}:\n{}\n{}'.format(
            jp2_path,
            result.stdout.decode(errors='replace'),
            result.stderr.decode(errors='replace'),
        ))
        return

    _share_with_procat(png_path, 0o664)
def parse_pdf(fname, debug=0):
    """Collect product boxes and scribbles from a PDF's page annotations.

    For every entry in the 'Kids' list of the document's root 'Pages' node,
    the page's 'Annots' are inspected: annotations carrying 'AAPL:AKExtras'
    are passed to make_scribble(), annotations carrying 'ProCatName' are
    passed to make_product_box(), and everything else is ignored.

    Args:
        fname: Path of the PDF file to read.
        debug: Value assigned to the ``debug`` attribute of the PDFDocument
            and PDFParser classes.

    Returns:
        A two-element list ``[prod_boxes, scribbles]``. Annotations that the
        helper functions skipped (returned None for) are left out.
    """
    PDFDocument.debug = debug
    PDFParser.debug = debug

    prod_boxes = []
    scribbles = []

    # `with` guarantees the file is closed even when parsing raises.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)

        page_dict = resolve1(doc.catalog['Pages'])
        # NOTE(review): only the direct children of the root Pages node are
        # visited; if a document nests intermediate Pages nodes, their pages
        # would not be reached and page numbers would be off - confirm
        # whether such input can occur.
        pages = resolve1(page_dict['Kids'])

        for pagenum, page in enumerate(pages, start=1):
            page = resolve1(page)
            if 'Annots' not in page:
                continue
            # resolve1() returns non-reference values unchanged, so it is
            # safe whether 'Annots' is inline or an indirect reference.
            annots = resolve1(page['Annots'])
            for anno in annots:
                anno = resolve1(anno)
                if 'AAPL:AKExtras' in anno:
                    scribble = make_scribble(anno, pagenum)
                    if scribble is not None:
                        scribbles.append(scribble)
                elif 'ProCatName' in anno:
                    box = make_product_box(anno, pagenum)
                    if box is not None:
                        prod_boxes.append(box)
                else:
                    print('ignoring other annotation')

    return [prod_boxes, scribbles]

0
markup/work/__init__.py Normal file
View File

38
markup/work/test_pdf.py Executable file
View File

@ -0,0 +1,38 @@
#!/usr/bin/env python3
import sys
import os
import inspect
# Put the directory two levels above this script's own directory at the
# front of sys.path, so that the `markup` and `procat2` packages imported
# below can be found when the script is run directly.
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
parentparentdir = os.path.dirname(parentdir)
sys.path.insert(0, parentparentdir)
import dumper
import getopt
import django
from django.conf import settings
# Default to the procat2 settings module unless the environment already
# names one, then initialise Django before any project module is imported.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'procat2.settings')
django.setup()
from markup.utils import parse_pdf
from procat2.settings import ASSET_DIR
def main(argv):
    """Parse the PDF files named on the command line.

    Usage: ``<prog> [-d] file ...``. Each ``-d`` raises the debug level
    passed to parse_pdf() by one.

    Args:
        argv: Full argument vector, program name first.

    Returns:
        100 after printing the usage line (unknown option or no file given),
        otherwise 0 once every file has been parsed.
    """
    def usage():
        print('usage: %s [-d] file ...' % argv[0])
        return 100

    try:
        opts, args = getopt.getopt(argv[1:], 'd')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    debug = sum(1 for flag, _ in opts if flag == '-d')

    # The usage line advertises several files; handle all of them rather
    # than only the first.
    for fname in args:
        parse_pdf(fname, debug)
    return 0
# When run as a script, call main() with the process arguments and use its
# return value as the exit status.
if __name__ == '__main__': sys.exit(main(sys.argv))