markup: matching works
This commit is contained in:
@ -1,28 +1,20 @@
|
||||
#from __future__ import absolute_import, unicode_literals
|
||||
|
||||
import sys, os.path, re, json, pickle, subprocess
|
||||
import os
|
||||
import sys
|
||||
import subprocess
|
||||
import shutil
|
||||
|
||||
#from pprint import pprint
|
||||
#import dumper
|
||||
|
||||
#from pdfminer.psparser import PSKeyword, PSLiteral, LIT
|
||||
from pdfminer.pdfparser import PDFParser
|
||||
from pdfminer.pdfdocument import PDFDocument #, PDFNoOutlines
|
||||
#from pdfminer.pdftypes import PDFObjectNotFound, PDFValueError, PDFNotImplementedError
|
||||
#from pdfminer.pdftypes import dict_value, num_value, list_value
|
||||
#from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1, resolve_all, stream_value
|
||||
from pdfminer.pdfdocument import PDFDocument
|
||||
from pdfminer.pdftypes import PDFObjRef, resolve1
|
||||
#from pdfminer.pdfpage import PDFPage
|
||||
#from pdfminer.utils import isnumber
|
||||
#from pdfminer.image import ImageWriter
|
||||
|
||||
from django.conf import settings
|
||||
|
||||
from .utils import pdf_rect
|
||||
|
||||
WORKDIR = os.path.join(settings.ASSET_DIR, 'markup', 'work')
|
||||
|
||||
|
||||
def make_product_box(obj, pagenum):
|
||||
def make_product_box(obj, pagenum, mediabox):
|
||||
name = obj['ProCatName'].decode()
|
||||
material = obj['ProCatMaterialNumber'].decode()
|
||||
color = obj['ProCatColor'].decode()
|
||||
@ -34,7 +26,7 @@ def make_product_box(obj, pagenum):
|
||||
'name': name,
|
||||
'color': color,
|
||||
'gender': gender,
|
||||
'rect': rect,
|
||||
'rect': pdf_rect(rect, mediabox[3]),
|
||||
'page': pagenum }
|
||||
else:
|
||||
print('Annotation without rect:')
|
||||
@ -42,30 +34,28 @@ def make_product_box(obj, pagenum):
|
||||
return None
|
||||
|
||||
|
||||
def make_scribble(obj, pagenum, mediabox):
    """Build a scribble record from a markup annotation that carries an image.

    Walks the annotation's appearance dictionary down to the embedded image
    (``AP`` -> ``N`` -> ``Resources`` -> ``XObject`` -> ``Im1``) and, when
    that image uses the ``JPXDecode`` filter, exports it with ``export_jp2``.

    Args:
        obj: Resolved annotation dictionary; must contain ``Rect`` and ``AP``.
        pagenum: Page number, stored unchanged in the result.
        mediabox: The page's ``MediaBox``. Its fourth entry is handed to
            ``pdf_rect`` together with the annotation rectangle
            (presumably the page height, used to flip the y axis --
            TODO confirm against ``utils.pdf_rect``).

    Returns:
        A dict with the keys ``page``, ``rect``, ``objid`` and ``image``,
        or None when the annotation is skipped (no normal appearance, or
        the image is not a single-filter JPEG 2000 stream).
    """
    rect = obj['Rect']  # position on page

    # walk the object tree down to the image
    appearance = resolve1(obj['AP'])
    normal_appearance = appearance['N']

    # Only an indirect reference has a meaningful object id to validate; an
    # appearance embedded directly in the annotation is used as it is.
    is_null_ref = (isinstance(normal_appearance, PDFObjRef)
                   and normal_appearance.objid <= 0)
    if not normal_appearance or is_null_ref:
        print('skipping scribble - no normal appearance')
        return None

    normal_appearance = resolve1(normal_appearance)
    resources = resolve1(normal_appearance['Resources'])
    xobj = resolve1(resources['XObject'])
    im1 = resolve1(xobj['Im1'])  # PDFStream of the image

    # /Filter is optional and may be an array of names rather than a single
    # name; anything that is not exactly one JPXDecode name is skipped
    # instead of raising.
    flter = resolve1(im1.get('Filter'))
    if getattr(flter, 'name', None) != 'JPXDecode':
        print('skipping non-jp2 image')
        return None

    # Export exactly once and keep the resulting path for the caller.
    path = export_jp2(im1)
    return {'page': pagenum,
            'rect': pdf_rect(rect, mediabox[3]),
            'objid': im1.objid,
            'image': path}
|
||||
@ -94,9 +84,10 @@ def export_jp2(obj):
|
||||
os.chmod(png_path, 0o664)
|
||||
shutil.chown(png_path, group='procat')
|
||||
|
||||
return png_path
|
||||
|
||||
|
||||
def parse_pdf(fname, debug=0):
|
||||
|
||||
PDFDocument.debug = debug
|
||||
PDFParser.debug = debug
|
||||
|
||||
@ -115,6 +106,11 @@ def parse_pdf(fname, debug=0):
|
||||
page = resolve1(page)
|
||||
if not 'Annots' in page: continue
|
||||
|
||||
mediabox = page['MediaBox']
|
||||
# if 'CropBox' in page:
|
||||
# cropbox = page['CropBox']
|
||||
# print('crop',cropbox)
|
||||
|
||||
annots = page['Annots']
|
||||
if isinstance(annots, PDFObjRef):
|
||||
annots = resolve1(annots)
|
||||
@ -122,9 +118,9 @@ def parse_pdf(fname, debug=0):
|
||||
for anno in annots:
|
||||
anno = resolve1(anno)
|
||||
if 'AAPL:AKExtras' in anno:
|
||||
scribbles.append(make_scribble(anno, pagenum))
|
||||
scribbles.append(make_scribble(anno, pagenum, mediabox))
|
||||
elif 'ProCatName' in anno:
|
||||
prod_boxes.append(make_product_box(anno, pagenum))
|
||||
prod_boxes.append(make_product_box(anno, pagenum, mediabox))
|
||||
else:
|
||||
print('ignoring other annotation')
|
||||
|
||||
|
||||
Reference in New Issue
Block a user