Files
procat2/markup/pdf.py

124 lines
3.8 KiB
Python

import os
import sys
import subprocess
import shutil
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdftypes import PDFObjRef, resolve1
from .utils import pdf_rect, ensure_dir, set_file_perms
def make_product_box(obj, pagenum, mediabox):
    """Build a product-box record from a ProCat PDF annotation.

    Args:
        obj: Resolved annotation dictionary. The optional ``ProCat*`` entries
            are decoded with ``.decode()``; a missing entry becomes ``''``.
        pagenum: Page number to store in the record.
        mediabox: The page's MediaBox; its fourth element is handed to
            ``pdf_rect`` together with the annotation's ``Rect``.

    Returns:
        A dict with the keys ``material``, ``name``, ``color``, ``gender``,
        ``season``, ``rect`` and ``page``, or ``None`` when the annotation
        has no usable ``Rect`` (the annotation is printed in that case).
    """
    rect = obj.get('Rect')
    if not rect:
        # Report the offending annotation and let the caller drop it.
        print('Annotation without rect:')
        print(repr(obj))
        return None

    def text(key):
        # Optional entries default to an empty string.
        return obj[key].decode() if key in obj else ''

    return {
        'material': text('ProCatMaterialNumber'),
        'name': text('ProCatName'),
        'color': text('ProCatColor'),
        'gender': text('ProCatGender'),
        'season': text('ProCatSeason'),
        'rect': pdf_rect(rect, mediabox[3]),
        'page': pagenum,
    }
def make_scribble(obj, pagenum, mediabox, workdir):
    """Build a scribble record from an annotation carrying a JPEG 2000 image.

    The image is looked up along the fixed path
    ``AP -> N -> Resources -> XObject -> Im1`` below the annotation and, when
    JPXDecode is its only filter, exported through ``export_jp2``.

    Args:
        obj: Resolved annotation dictionary.
        pagenum: Page number to record; also passed to ``export_jp2``.
        mediabox: The page's MediaBox; its fourth element is handed to
            ``pdf_rect`` together with the annotation's ``Rect``.
        workdir: Directory passed to ``export_jp2`` for the exported files.

    Returns:
        A dict with the keys ``page``, ``rect``, ``objid`` and ``image``, or
        ``None`` when the annotation is skipped (the reason is printed).
    """
    rect = obj.get('Rect')  # position on page
    if not rect:
        print('skipping scribble - no rect')
        return None

    normal_appearance = None
    if 'AP' in obj:
        appearance = resolve1(obj['AP'])
        if appearance is not None and 'N' in appearance:
            normal_appearance = appearance['N']
    # Only objects that carry an object id can be checked for a bogus
    # (non-positive) one; a direct object has no id and is used as it is.
    objid = getattr(normal_appearance, 'objid', None)
    if not normal_appearance or (objid is not None and objid <= 0):
        print('skipping scribble - no normal appearance')
        return None

    # Walk the object tree down to the image, skipping the annotation
    # instead of crashing when a level is missing.
    node = normal_appearance
    for key in ('Resources', 'XObject', 'Im1'):
        node = resolve1(node)
        if node is None or key not in node:
            print('skipping scribble - no {} entry'.format(key))
            return None
        node = node[key]
    im1 = resolve1(node)  # PDFStream of the image

    # /Filter may be a single name or an array of names. The raw stream data
    # is a usable .jp2 file only when JPXDecode is the sole filter.
    flter = resolve1(im1.get('Filter')) if im1 is not None else None
    filters = flter if isinstance(flter, (list, tuple)) else [flter]
    filter_names = [getattr(resolve1(f), 'name', None) for f in filters]
    if filter_names != ['JPXDecode']:
        print('skipping non-jp2 image')
        return None

    path = export_jp2(im1, workdir, pagenum)
    return {
        'page': pagenum,
        'rect': pdf_rect(rect, mediabox[3]),
        'objid': im1.objid,
        'image': path,
    }
def export_jp2(obj, workdir, pagenum):
    """Write a JPEG 2000 image stream to disk and convert it to PNG.

    The raw stream data is saved as ``export-pageNNN-objNNNNN.jp2`` inside
    *workdir* and then converted to a ``.png`` of the same stem with the
    external ``opj_decompress`` tool.

    Args:
        obj: PDF stream object providing ``objid`` and ``get_rawdata()``.
        workdir: Output directory; passed to ``ensure_dir`` before writing.
        pagenum: Page number, used only in the generated file names.

    Returns:
        The path of the PNG file, or ``None`` when the conversion failed
        (the tool's output is printed in that case).

    Raises:
        OSError: If the ``.jp2`` file cannot be written or
            ``opj_decompress`` cannot be started.
    """
    oid = obj.objid
    ensure_dir(workdir)
    stem = os.path.join(workdir, f"export-page{pagenum:03d}-obj{oid:05d}")
    jp2_path = stem + '.jp2'
    png_path = stem + '.png'

    data = obj.get_rawdata()
    print('extracting jp2: {}'.format(jp2_path))
    with open(jp2_path, 'wb') as out:
        out.write(data)
    set_file_perms(jp2_path)

    result = subprocess.run(
        ['opj_decompress', '-i', jp2_path, '-o', png_path],
        capture_output=True,
    )
    if result.returncode != 0:
        # Tool output is not guaranteed to be valid UTF-8; the error report
        # itself must never fail and hide the real problem.
        stdout = result.stdout.decode(errors='replace')
        stderr = result.stderr.decode(errors='replace')
        print('ERROR converting {}:\n{}\n{}'.format(jp2_path, stdout, stderr))
        # No PNG was produced, so do not hand out a path to it.
        return None

    set_file_perms(png_path)
    return png_path
def _iter_pages(node, inherited_mediabox=None, _seen=None):
    """Yield ``(page_dict, mediabox)`` for every leaf page below *node*.

    Walks the page tree depth-first in document order. A node that has a
    ``Kids`` entry is treated as an intermediate node; anything else is a
    page. ``MediaBox`` is taken from the nearest node that defines it and is
    ``None`` when no node on the path does.
    """
    if _seen is None:
        _seen = set()
    objid = getattr(node, 'objid', None)
    if objid is not None:
        if objid in _seen:
            return  # malformed file: the tree refers back to itself
        _seen.add(objid)

    node = resolve1(node)
    if not isinstance(node, dict):
        return
    mediabox = resolve1(node.get('MediaBox', inherited_mediabox))
    if 'Kids' in node:
        for kid in resolve1(node['Kids']) or []:
            yield from _iter_pages(kid, mediabox, _seen)
    else:
        yield node, mediabox


def parse_pdf(fname, workdir, debug=0):
    """Extract product boxes and scribbles from the annotations of a PDF.

    Annotations with an ``AAPL:AKExtras`` entry go to ``make_scribble``,
    annotations with a ``ProCatName`` entry go to ``make_product_box``, and
    everything else is ignored with a message. Pages are numbered from 1 in
    document order.

    Args:
        fname: Path of the PDF file to read.
        workdir: Directory passed on to ``make_scribble`` for exported images.
        debug: Value assigned to the ``debug`` attribute of ``PDFDocument``
            and ``PDFParser``.

    Returns:
        A two-element list ``[product_boxes, scribbles]``; annotations for
        which the helper returned nothing are left out.
    """
    PDFDocument.debug = debug
    PDFParser.debug = debug

    prod_boxes = []
    scribbles = []
    # The context manager closes the file even when parsing raises.
    with open(fname, 'rb') as fp:
        doc = PDFDocument(PDFParser(fp))
        pages = _iter_pages(doc.catalog['Pages'])
        for pagenum, (page, mediabox) in enumerate(pages, start=1):
            if 'Annots' not in page:
                continue
            if mediabox is None:
                # The helpers need the page height to place a rectangle.
                print('skipping page {} - no MediaBox'.format(pagenum))
                continue
            for anno in resolve1(page['Annots']) or []:
                anno = resolve1(anno)
                if not isinstance(anno, dict):
                    print('ignoring unresolvable annotation')
                    continue
                if 'AAPL:AKExtras' in anno:
                    record = make_scribble(anno, pagenum, mediabox, workdir)
                    if record:
                        scribbles.append(record)
                elif 'ProCatName' in anno:
                    record = make_product_box(anno, pagenum, mediabox)
                    if record:
                        prod_boxes.append(record)
                else:
                    print('ignoring other annotation')
    return [prod_boxes, scribbles]