move markup parsing to utils.py with a test script
This commit is contained in:
133
markup/utils.py
Normal file
133
markup/utils.py
Normal file
@ -0,0 +1,133 @@
|
|||||||
|
#from __future__ import absolute_import, unicode_literals
|
||||||
|
|
||||||
|
import sys, os.path, re, json, pickle, subprocess
|
||||||
|
import shutil
|
||||||
|
|
||||||
|
#from pprint import pprint
|
||||||
|
#import dumper
|
||||||
|
|
||||||
|
#from pdfminer.psparser import PSKeyword, PSLiteral, LIT
|
||||||
|
from pdfminer.pdfparser import PDFParser
|
||||||
|
from pdfminer.pdfdocument import PDFDocument #, PDFNoOutlines
|
||||||
|
#from pdfminer.pdftypes import PDFObjectNotFound, PDFValueError, PDFNotImplementedError
|
||||||
|
#from pdfminer.pdftypes import dict_value, num_value, list_value
|
||||||
|
#from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1, resolve_all, stream_value
|
||||||
|
from pdfminer.pdftypes import PDFObjRef, resolve1
|
||||||
|
#from pdfminer.pdfpage import PDFPage
|
||||||
|
#from pdfminer.utils import isnumber
|
||||||
|
#from pdfminer.image import ImageWriter
|
||||||
|
|
||||||
|
from django.conf import settings
|
||||||
|
|
||||||
|
WORKDIR = os.path.join(settings.ASSET_DIR, 'markup', 'work')
|
||||||
|
|
||||||
|
|
||||||
|
def make_product_box(obj, pagenum):
    """Build a product-box record from a ProCat annotation dictionary.

    Args:
        obj: Annotation dictionary. The ``ProCatName``,
            ``ProCatMaterialNumber``, ``ProCatColor`` and ``ProCatGender``
            entries are read and ``.decode()``d (i.e. they are expected to
            be byte strings).
        pagenum: Page number stored in the record under ``page``.

    Returns:
        A dict with the keys ``material``, ``name``, ``color``, ``gender``,
        ``rect`` and ``page``; or ``None`` when the annotation has no
        ``Rect`` entry or an empty one.

    Raises:
        KeyError: if one of the ``ProCat*`` entries is missing on an
            annotation that does have a rect.
    """
    # Local import: only needed for the diagnostic output below, and the
    # module-level ``dumper`` import this code used to rely on is disabled.
    from pprint import pformat

    # ``.get`` so a missing Rect is reported the same way as an empty one
    # instead of raising KeyError before the diagnostic can be printed.
    rect = obj.get('Rect')
    if not rect:
        print('Annotation without rect:')
        print(pformat(obj))
        return None

    return {
        'material': obj['ProCatMaterialNumber'].decode(),
        'name': obj['ProCatName'].decode(),
        'color': obj['ProCatColor'].decode(),
        'gender': obj['ProCatGender'].decode(),
        'rect': rect,
        'page': pagenum,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def make_scribble(obj, pagenum):
    """Export the image behind a scribble annotation and describe it.

    Walks the annotation's object tree
    ``AP -> N -> Resources -> XObject -> Im1`` down to the image stream and,
    when that stream's filter is ``JPXDecode``, hands it to ``export_jp2``.

    Args:
        obj: Resolved annotation dictionary; must contain ``Rect``.
        pagenum: Page number stored in the record under ``page``.

    Returns:
        A dict with the keys ``page``, ``rect`` and ``objid`` (the object id
        of the image stream), or ``None`` when the annotation is skipped
        (no usable normal appearance, or the image is not JPEG 2000).

    Raises:
        KeyError: if ``Rect``, ``Resources``, ``XObject`` or ``Im1`` is
            missing along the way.
    """
    rect = obj['Rect']  # position

    # walk the object tree down to the image
    appearance = resolve1(obj.get('AP'))
    normal_ref = appearance.get('N') if appearance else None

    # A missing AP/N entry, or an N value without a positive object id,
    # is treated as "no normal appearance" rather than crashing.
    # ``objid`` may be absent or None, hence the ``or 0``.
    if not normal_ref or (getattr(normal_ref, 'objid', None) or 0) <= 0:
        print('skipping scribble - no normal appearance')
        return None

    normal_appearance = resolve1(normal_ref)
    resources = resolve1(normal_appearance['Resources'])
    xobjects = resolve1(resources['XObject'])
    im1 = resolve1(xobjects['Im1'])  # PDFStream of the image

    # Filter can be absent or an array of filters; neither has ``.name``,
    # so anything that is not a single JPXDecode name is skipped.
    flter = resolve1(im1.get('Filter'))
    if getattr(flter, 'name', None) != 'JPXDecode':
        print('skipping non-jp2 image')
        return None

    export_jp2(im1)
    return {
        'page': pagenum,
        'rect': rect,
        'objid': im1.objid,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def export_jp2(obj):
    """Write a JPEG 2000 stream to WORKDIR and convert it to PNG.

    The raw stream data is saved as ``export-<objid>.jp2`` and then
    converted to ``export-<objid>.png`` by running the external
    ``opj_decompress`` program. The work directory and both files are made
    group-writable and assigned to the ``procat`` group.

    Args:
        obj: Stream object providing ``objid`` and ``get_rawdata()``.

    Returns:
        None. A failed conversion is reported on stdout, not raised.

    Raises:
        OSError: propagated from file, permission or process operations
            (for example when ``opj_decompress`` cannot be started).
    """
    jp2_path = os.path.join(WORKDIR, "export-{}.jp2".format(obj.objid))
    png_path = os.path.join(WORKDIR, "export-{}.png".format(obj.objid))

    # Create-then-handle instead of exists()-then-create, so a concurrent
    # creator cannot make this fail. Permissions are only set by whoever
    # actually created the directory, as before.
    try:
        os.makedirs(WORKDIR)
    except FileExistsError:
        pass
    else:
        os.chmod(WORKDIR, 0o775)
        shutil.chown(WORKDIR, group='procat')

    data = obj.get_rawdata()
    print('extracting jp2: {}'.format(jp2_path))
    with open(jp2_path, 'wb') as out:
        out.write(data)
    os.chmod(jp2_path, 0o664)
    shutil.chown(jp2_path, group='procat')

    result = subprocess.run(
        ['opj_decompress', '-i', jp2_path, '-o', png_path],
        capture_output=True,
    )
    if result.returncode != 0:
        # errors='replace': the tool's output is not guaranteed to be valid
        # UTF-8, and reporting the failure must not raise itself.
        print('ERROR converting {}:\n{}\n{}'.format(
            jp2_path,
            result.stdout.decode(errors='replace'),
            result.stderr.decode(errors='replace'),
        ))
    else:
        os.chmod(png_path, 0o664)
        shutil.chown(png_path, group='procat')
|
||||||
|
|
||||||
|
|
||||||
|
def parse_pdf(fname, debug=0):
    """Collect product boxes and scribbles from a PDF's annotations.

    Every annotation on every visited page is classified by the keys it
    contains: ``AAPL:AKExtras`` marks a scribble (see ``make_scribble``),
    ``ProCatName`` marks a product box (see ``make_product_box``); anything
    else is ignored.

    Args:
        fname: Path of the PDF file to read.
        debug: Debug level assigned to ``PDFDocument.debug`` and
            ``PDFParser.debug`` (class-level, so it affects later parses
            too).

    Returns:
        A two-element list ``[prod_boxes, scribbles]``. Annotations that
        the factory functions rejected (returned ``None`` for) are left out.
    """
    PDFDocument.debug = debug
    PDFParser.debug = debug

    prod_boxes = []
    scribbles = []

    # ``with`` guarantees the file is closed even if parsing raises.
    with open(fname, 'rb') as fp:
        doc = PDFDocument(PDFParser(fp))

        page_dict = resolve1(doc.catalog['Pages'])
        # NOTE(review): only the direct Kids of the root Pages node are
        # visited; intermediate Pages nodes of a nested page tree are not
        # descended into - confirm the input PDFs have a flat page tree.
        pages = resolve1(page_dict['Kids'])

        for pagenum, page in enumerate(pages, start=1):
            page = resolve1(page)
            if 'Annots' not in page:
                continue

            # Annots may be given inline or as an indirect reference;
            # resolve1 returns non-references unchanged.
            annots = resolve1(page['Annots'])

            for anno in annots:
                anno = resolve1(anno)
                if 'AAPL:AKExtras' in anno:
                    scribble = make_scribble(anno, pagenum)
                    # skipped scribbles come back as None - don't keep them
                    if scribble is not None:
                        scribbles.append(scribble)
                elif 'ProCatName' in anno:
                    box = make_product_box(anno, pagenum)
                    if box is not None:
                        prod_boxes.append(box)
                else:
                    print('ignoring other annotation')

    return [prod_boxes, scribbles]
|
||||||
0
markup/work/__init__.py
Normal file
0
markup/work/__init__.py
Normal file
38
markup/work/test_pdf.py
Executable file
38
markup/work/test_pdf.py
Executable file
@ -0,0 +1,38 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import inspect
|
||||||
|
|
||||||
|
# Put the directory two levels above this script's own directory at the front
# of sys.path (presumably so the project packages imported further down can
# be found when the script is run directly).
_this_file = os.path.abspath(inspect.getfile(inspect.currentframe()))
currentdir = os.path.dirname(_this_file)
parentdir = os.path.dirname(currentdir)
parentparentdir = os.path.dirname(parentdir)
sys.path.insert(0, parentparentdir)
|
||||||
|
|
||||||
|
import dumper
|
||||||
|
import getopt
|
||||||
|
import django
|
||||||
|
from django.conf import settings
|
||||||
|
|
||||||
|
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'procat2.settings')
|
||||||
|
django.setup()
|
||||||
|
|
||||||
|
from markup.utils import parse_pdf
|
||||||
|
from procat2.settings import ASSET_DIR
|
||||||
|
|
||||||
|
|
||||||
|
def main(argv):
    """Command-line entry point: run ``parse_pdf`` on the given files.

    Args:
        argv: Full argument vector including the program name at index 0.
            Each ``-d`` option raises the debug level by one; the remaining
            arguments are the PDF files to parse.

    Returns:
        100 after printing the usage line when the options are invalid or
        no file is given; otherwise ``None``.
    """
    def usage():
        print('usage: %s [-d] file ...' % argv[0])
        return 100

    try:
        opts, args = getopt.getopt(argv[1:], 'd')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # every occurrence of -d increases the debug level
    debug = sum(1 for flag, _value in opts if flag == '-d')

    # The usage line promises "file ...", so handle every file given,
    # not just the first one.
    for fname in args:
        parse_pdf(fname, debug)
|
||||||
|
|
||||||
|
if __name__ == '__main__': sys.exit(main(sys.argv))
|
||||||
@ -12,6 +12,7 @@ django-lazysignup==2.0.0
|
|||||||
django-settings-export==1.2.1
|
django-settings-export==1.2.1
|
||||||
django-user-accounts==2.1.0
|
django-user-accounts==2.1.0
|
||||||
django-webpack-loader==0.6.0
|
django-webpack-loader==0.6.0
|
||||||
|
Dumper==1.2.0
|
||||||
humanize==0.5.1
|
humanize==0.5.1
|
||||||
importlib-metadata==0.23
|
importlib-metadata==0.23
|
||||||
ipdb==0.11
|
ipdb==0.11
|
||||||
@ -21,6 +22,7 @@ jedi==0.13.3
|
|||||||
kombu==4.6.5
|
kombu==4.6.5
|
||||||
more-itertools==7.2.0
|
more-itertools==7.2.0
|
||||||
parso==0.3.4
|
parso==0.3.4
|
||||||
|
pdfminer==20191010
|
||||||
pexpect==4.6.0
|
pexpect==4.6.0
|
||||||
pickleshare==0.7.5
|
pickleshare==0.7.5
|
||||||
prompt-toolkit==2.0.9
|
prompt-toolkit==2.0.9
|
||||||
|
|||||||
Reference in New Issue
Block a user