231 lines
7.2 KiB
Python
231 lines
7.2 KiB
Python
import os
|
|
import sys
|
|
import subprocess
|
|
import shutil
|
|
import dumper
|
|
|
|
from pdfminer.psparser import LIT
|
|
from pdfminer.pdfparser import PDFParser
|
|
from pdfminer.pdfdocument import PDFDocument
|
|
from pdfminer.pdftypes import PDFObjRef, resolve1
|
|
|
|
from .utils import Rect, pdf_rect, ensure_dir, set_file_perms
|
|
from .img import write_inklist, write_square_or_circle
|
|
|
|
|
|
def make_product_box(obj, pagenum, mediabox):
    """Build a product-box record from a ProCat annotation dictionary.

    Args:
        obj: Resolved annotation dictionary. Each ``ProCat*`` entry that is
            present is decoded with ``.decode()``; absent entries become ``''``.
        pagenum: Page number stored in the record under ``'page'``.
        mediabox: Page media box; ``mediabox[3]`` is handed to ``pdf_rect``
            together with the annotation's ``Rect``.

    Returns:
        A dict with the keys ``material``, ``name``, ``color``, ``gender``,
        ``season``, ``size``, ``category``, ``rect`` and ``page``, or ``None``
        (after printing a dump of the annotation) when ``Rect`` is missing
        or empty.
    """
    # .get() so that a missing Rect takes the reporting branch below
    # instead of raising KeyError before it can be reached.
    rect = obj.get('Rect')
    if not rect:
        print('Annotation without rect:')
        print(dumper.dump(obj))
        return None

    def text(key):
        # Optional entry: decode it when present, otherwise use ''.
        return obj[key].decode() if key in obj else ''

    return {
        'material': text('ProCatMaterialNumber'),
        'name': text('ProCatName'),
        'color': text('ProCatColor'),
        'gender': text('ProCatGender'),
        'season': text('ProCatSeason'),
        'size': text('ProCatSize'),
        'category': text('ProCatCategory'),
        'rect': pdf_rect(rect, mediabox[3]),
        'page': pagenum,
    }
|
|
|
|
|
|
def make_ink_scribble(obj, pagenum, mediabox, workdir):
    """Export an Ink annotation via ``write_inklist`` and describe the result.

    Args:
        obj: Annotation dictionary; its ``NM`` entry is decoded as UTF-8 and
            used both as the object id and in the output file name.
        pagenum: Page number, zero-padded to three digits in the file name.
        mediabox: Page media box; passed to ``write_inklist`` and expanded
            into ``Rect`` for the record.
        workdir: Directory in which the PNG path is placed.

    Returns:
        A dict with the keys ``page``, ``rect``, ``objid`` and ``image``.
    """
    annotation_id = obj['NM'].decode('utf-8')
    image_name = f"export-page{pagenum:03d}-nm{annotation_id}.png"
    image_path = os.path.join(workdir, image_name)

    write_inklist(obj, mediabox, image_path)

    scribble = {}
    scribble['page'] = pagenum
    # The scribble is positioned against the whole media box.
    scribble['rect'] = Rect(*mediabox)
    scribble['objid'] = annotation_id
    scribble['image'] = image_path
    return scribble
|
|
|
|
|
|
def make_square_or_circle_scribble(obj, pagenum, mediabox, workdir):
    """Export a Square/Circle annotation via ``write_square_or_circle``.

    Args:
        obj: Annotation dictionary; its ``NM`` entry is decoded as UTF-8 and
            used both as the object id and in the output file name.
        pagenum: Page number, zero-padded to three digits in the file name.
        mediabox: Page media box; passed to ``write_square_or_circle`` and
            expanded into ``Rect`` for the record.
        workdir: Directory in which the PNG path is placed.

    Returns:
        A dict with the keys ``page``, ``rect``, ``objid`` and ``image``.
    """
    annotation_id = obj['NM'].decode('utf-8')
    image_name = f"export-page{pagenum:03d}-nm{annotation_id}.png"
    image_path = os.path.join(workdir, image_name)

    write_square_or_circle(obj, mediabox, image_path)

    scribble = {}
    scribble['page'] = pagenum
    # The scribble is positioned against the whole media box.
    scribble['rect'] = Rect(*mediabox)
    scribble['objid'] = annotation_id
    scribble['image'] = image_path
    return scribble
|
|
|
|
|
|
def make_aapl_scribble(obj, pagenum, mediabox, workdir):
    """Export the image behind an ``AAPL:AKExtras`` annotation.

    Follows ``AP`` -> ``N`` -> ``Resources`` -> ``XObject`` -> ``Im1`` to the
    image stream and exports it according to the stream's ``Filter`` name:
    ``JPXDecode`` goes through ``export_jp2``, ``FlateDecode`` through
    ``export_netpbm``.

    Args:
        obj: Annotation dictionary providing ``Rect`` and ``AP``.
        pagenum: Page number stored in the record and passed to the exporter.
        mediabox: Page media box; ``mediabox[3]`` is handed to ``pdf_rect``.
        workdir: Directory passed to the exporter.

    Returns:
        A dict with the keys ``page``, ``rect``, ``objid`` and ``image``, or
        ``None`` when there is no usable normal appearance or the filter is
        not one of the two handled names.
    """
    rect = obj['Rect']  # position on page

    # Walk the object tree down to the image.
    appearance = resolve1(obj['AP'])
    normal_ref = appearance['N']
    if not normal_ref or normal_ref.objid <= 0:
        print('skipping scribble - no normal appearance')
        return None

    normal_stream = resolve1(normal_ref)
    resources = resolve1(normal_stream['Resources'])
    xobjects = resolve1(resources['XObject'])
    image = resolve1(xobjects['Im1'])  # PDFStream of the image

    filter_name = image['Filter'].name
    if filter_name == 'JPXDecode':
        exporter = export_jp2
    elif filter_name == 'FlateDecode':
        exporter = export_netpbm
    else:
        print('skipping unrecognized image')
        return None

    image_path = exporter(image, workdir, pagenum)
    return {
        'page': pagenum,
        'rect': pdf_rect(rect, mediabox[3]),
        'objid': image.objid,
        'image': image_path,
    }
|
|
|
|
|
|
def export_jp2(obj, workdir, pagenum):
    """Dump a JPEG 2000 image stream to disk and convert it to PNG.

    The raw stream bytes are written to a ``.jp2`` file in ``workdir`` and
    ``opj_decompress`` is then run to produce a ``.png`` next to it.

    Args:
        obj: Image stream providing ``objid`` and ``get_rawdata()``.
        workdir: Output directory; ``ensure_dir`` is called on it first.
        pagenum: Page number used in the output file names.

    Returns:
        The PNG path. It is returned even when the conversion failed; in that
        case an error message with the tool's output has been printed.
    """
    oid = obj.objid
    ensure_dir(workdir)
    stem = os.path.join(workdir, f"export-page{pagenum:03d}-obj{oid:05d}")
    jp2_path = stem + ".jp2"
    png_path = stem + ".png"

    data = obj.get_rawdata()
    print(f'extracting jp2: {jp2_path}')
    with open(jp2_path, 'wb') as out:
        out.write(data)
    set_file_perms(jp2_path)

    result = subprocess.run(
        ['opj_decompress', '-i', jp2_path, '-o', png_path],
        capture_output=True,
    )
    if result.returncode != 0:
        # The tool's output encoding is not under our control; a strict
        # decode could raise UnicodeDecodeError while reporting the failure.
        stdout = result.stdout.decode(errors='replace')
        stderr = result.stderr.decode(errors='replace')
        print(f'ERROR converting {jp2_path}:\n{stdout}\n{stderr}')
    else:
        set_file_perms(png_path)

    return png_path
|
|
|
|
|
|
def export_netpbm(obj, workdir, pagenum):
    """Export an image stream as a Netpbm file via ``write_pbm``.

    The image itself is always written. When the stream has a non-empty
    ``SMask`` entry, the mask is written as well (with a ``-mask`` suffix on
    the base name) and the mask's path is returned instead of the image's.

    Args:
        obj: Image stream providing ``objid`` and ``attrs``.
        workdir: Output directory; ``ensure_dir`` is called on it first.
        pagenum: Page number used in the output file names.

    Returns:
        Path of the mask file if an ``SMask`` is present, otherwise the path
        of the image file.
    """
    oid = obj.objid
    ensure_dir(workdir)

    pbm_base = os.path.join(workdir, f"export-page{pagenum:03d}-obj{oid:05d}")
    pbm_path = write_pbm(obj, pbm_base)

    # SMask is optional: .get() avoids a KeyError for images without one.
    smask = obj.attrs.get('SMask')
    if smask:
        # Mask present - use it instead of the image.
        print('extracting pbm mask')
        # write_pbm resolves the reference itself, so pass it through as is.
        pbm_path = write_pbm(smask, pbm_base + "-mask")

    return pbm_path
|
|
|
|
|
|
def write_pbm(obj, base_path):
    """Write an image stream as a binary Netpbm file and return its path.

    The file gets the suffix ``.pgm`` and magic ``P5`` when the stream's
    ``ColorSpace`` equals ``LIT('DeviceGray')``; otherwise ``.ppm`` and
    ``P6``. The header's maximum value is ``255`` when ``BitsPerComponent``
    is 8 and ``65535`` for any other value.

    Args:
        obj: Image stream or a reference to one; it is passed through
            ``resolve1`` before use.
        base_path: Output path without suffix.

    Returns:
        ``base_path`` plus the chosen suffix.
    """
    stream = resolve1(obj)
    attrs = stream.attrs
    is_gray = resolve1(attrs['ColorSpace']) == LIT('DeviceGray')

    suffix = '.pgm' if is_gray else '.ppm'
    path = base_path + suffix

    print(f'writing pbm: {path}')

    payload = stream.get_data()
    with open(path, 'wb') as out:
        magic = "P5\n" if suffix == '.pgm' else "P6\n"
        out.write(magic.encode())

        dimensions = f"{stream.attrs['Width']} {stream.attrs['Height']}\n"
        out.write(dimensions.encode())

        maxval = "255\n" if stream.attrs['BitsPerComponent'] == 8 else "65535\n"
        out.write(maxval.encode())

        out.write(payload)

    set_file_perms(path)

    return path
|
|
|
|
|
|
def is_inklist_annotation(anno):
    """Return whether ``anno`` has a ``Subtype`` entry equal to ``LIT('Ink')``."""
    if 'Subtype' not in anno:
        return False
    return anno["Subtype"] == LIT('Ink')
|
|
|
|
|
|
def is_square_or_circle_annotation(anno):
    """Return True when ``anno``'s ``Subtype`` is ``Square`` or ``Circle``.

    Annotations without a ``Subtype`` entry yield False.
    """
    if 'Subtype' not in anno:
        return False
    return anno["Subtype"] in (LIT('Square'), LIT('Circle'))
|
|
|
|
|
|
def parse_pdf(fname, workdir, debug=0):
    """Collect product boxes and scribbles from the annotations of a PDF.

    Iterates over the entries of the catalog's ``Pages`` -> ``Kids`` array
    (numbered from 1) and dispatches every annotation of each page:

    * Ink annotations -> ``make_ink_scribble``
    * Square / Circle annotations -> ``make_square_or_circle_scribble``
    * annotations with an ``AAPL:AKExtras`` entry -> ``make_aapl_scribble``
    * annotations with a ``ProCatName`` entry -> ``make_product_box``
    * everything else is reported on stdout and ignored.

    Args:
        fname: Path of the PDF file to read.
        workdir: Directory handed to the scribble exporters.
        debug: Value assigned to ``PDFDocument.debug`` and ``PDFParser.debug``.

    Returns:
        A two-element list ``[product_boxes, scribbles]``; falsy results
        (e.g. ``None`` from a skipped annotation) are filtered out of both.
    """
    PDFDocument.debug = debug
    PDFParser.debug = debug

    prod_boxes = []
    scribbles = []

    # Context manager so the file is closed even when parsing raises.
    with open(fname, 'rb') as fp:
        doc = PDFDocument(PDFParser(fp))

        page_dict = resolve1(doc.catalog['Pages'])
        pages = resolve1(page_dict['Kids'])
        for pagenum, page in enumerate(pages, start=1):
            page = resolve1(page)
            if 'Annots' not in page:
                continue

            # resolve1 hands non-reference values back unchanged, so it is
            # safe to apply whether or not the entry is an indirect object.
            mediabox = resolve1(page['MediaBox'])
            annots = resolve1(page['Annots'])

            for anno in annots:
                anno = resolve1(anno)
                # An annotation without Subtype falls through to the generic
                # "ignoring" branch rather than raising KeyError.
                subtype = anno.get('Subtype')
                if is_inklist_annotation(anno):
                    scribbles.append(
                        make_ink_scribble(anno, pagenum, mediabox, workdir))
                elif is_square_or_circle_annotation(anno):
                    scribbles.append(
                        make_square_or_circle_scribble(anno, pagenum, mediabox, workdir))
                elif 'AAPL:AKExtras' in anno:
                    scribbles.append(
                        make_aapl_scribble(anno, pagenum, mediabox, workdir))
                elif 'ProCatName' in anno:
                    prod_boxes.append(make_product_box(anno, pagenum, mediabox))
                elif subtype == LIT('FreeText'):
                    print('ignoring FreeText annotation')
                elif subtype == LIT('Highlight'):
                    print('ignoring Highlight annotation')
                else:
                    print('ignoring other annotation:')
                    print(anno)

    return [list(filter(None, prod_boxes)), list(filter(None, scribbles))]
|