Files
procat2/markup/pdf.py

206 lines
6.2 KiB
Python

import os
import sys
import subprocess
import shutil
import dumper
from pdfminer.psparser import LIT
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdftypes import PDFObjRef, resolve1
from .utils import Rect, pdf_rect, ensure_dir, set_file_perms
from .img import write_inklist
def make_product_box(obj, pagenum, mediabox):
    """Build a product-box record from a ProCat annotation dictionary.

    Args:
        obj: Annotation dictionary. The optional ``ProCat*`` entries are
            decoded with ``.decode()``; absent entries become ``''``.
        pagenum: Page number stored in the result under ``'page'``.
        mediabox: The page's media box; its fourth element is passed to
            ``pdf_rect`` together with the annotation rectangle
            (presumably the page's top edge -- confirm against ``pdf_rect``).

    Returns:
        A dict with the keys ``material``, ``name``, ``color``, ``gender``,
        ``season``, ``size``, ``category``, ``rect`` and ``page``, or ``None``
        (after printing a dump of the annotation) when the annotation has no
        usable ``Rect`` entry.
    """
    # .get() instead of indexing: an annotation that lacks the 'Rect' key
    # must reach the diagnostic branch below rather than raise KeyError.
    rect = obj.get('Rect')
    if not rect:
        print('Annotation without rect:')
        print(dumper.dump(obj))
        return None

    def text(key):
        # Optional attribute: decode when present, empty string otherwise.
        return obj[key].decode() if key in obj else ''

    return {
        'material': text('ProCatMaterialNumber'),
        'name': text('ProCatName'),
        'color': text('ProCatColor'),
        'gender': text('ProCatGender'),
        'season': text('ProCatSeason'),
        'size': text('ProCatSize'),
        'category': text('ProCatCategory'),
        'rect': pdf_rect(rect, mediabox[3]),
        'page': pagenum,
    }
def make_ink_scribble(obj, pagenum, mediabox, workdir):
    """Render an Ink annotation to a PNG and describe it as a scribble.

    The annotation's ``NM`` entry, decoded as UTF-8, is used both in the PNG
    file name and as the record's ``objid``. The image itself is written by
    ``write_inklist``. The record's rectangle is built from the whole media
    box rather than from the annotation's own rectangle.

    Returns:
        A dict with the keys ``page``, ``rect``, ``objid`` and ``image``.
    """
    annotation_id = obj['NM'].decode('utf-8')
    file_name = f"export-page{pagenum:03d}-nm{annotation_id}.png"
    image_path = os.path.join(workdir, file_name)
    write_inklist(obj, image_path)
    return dict(
        page=pagenum,
        rect=Rect(*mediabox),
        objid=annotation_id,
        image=image_path,
    )
def make_aapl_scribble(obj, pagenum, mediabox, workdir):
    """Export the image behind an ``AAPL:AKExtras`` annotation.

    Follows the annotation's normal appearance down to the XObject named
    ``Im1`` and exports it according to the image's ``Filter``:
    ``JPXDecode`` goes through ``export_jp2`` and ``FlateDecode`` through
    ``export_netpbm``.

    Returns:
        A dict with the keys ``page``, ``rect``, ``objid`` and ``image``, or
        ``None`` when there is no usable normal appearance or the filter is
        not one of the two handled above.
    """
    rect = obj['Rect']  # position on page

    # Walk AP -> N -> Resources -> XObject -> Im1 down to the image.
    normal_ref = resolve1(obj['AP'])['N']
    if not normal_ref or normal_ref.objid <= 0:
        print('skipping scribble - no normal appearance')
        return None
    form = resolve1(normal_ref)
    resources = resolve1(form['Resources'])
    xobjects = resolve1(resources['XObject'])
    image = resolve1(xobjects['Im1'])  # PDFStream of the image

    # Pick the exporter first; both produce the same record shape.
    filter_name = image['Filter'].name
    if filter_name == 'JPXDecode':
        exporter = export_jp2
    elif filter_name == 'FlateDecode':
        exporter = export_netpbm
    else:
        print('skipping unrecognized image')
        # print(dumper.dump(image))
        return None

    exported_path = exporter(image, workdir, pagenum)
    return {
        'page': pagenum,
        'rect': pdf_rect(rect, mediabox[3]),
        'objid': image.objid,
        'image': exported_path,
    }
def export_jp2(obj, workdir, pagenum):
    """Dump a JPEG 2000 image stream and convert it to PNG.

    The stream's raw data is written to a ``.jp2`` file in *workdir*, then
    the external ``opj_decompress`` tool is run to produce a ``.png`` next
    to it. ``set_file_perms`` is applied to each file that gets written.

    Args:
        obj: Image stream; its ``objid`` is used in the file names.
        workdir: Output directory, passed to ``ensure_dir`` first.
        pagenum: Page number used in the file names.

    Returns:
        The PNG path. NOTE: the path is returned even when the conversion
        failed (an error is printed in that case), so the file may not exist.
    """
    oid = obj.objid
    ensure_dir(workdir)
    jp2_path = os.path.join(workdir, f"export-page{pagenum:03d}-obj{oid:05d}.jp2")
    png_path = os.path.join(workdir, f"export-page{pagenum:03d}-obj{oid:05d}.png")
    data = obj.get_rawdata()
    print('extracting jp2: {}'.format(jp2_path))
    with open(jp2_path, 'wb') as out:
        out.write(data)
    set_file_perms(jp2_path)

    result = subprocess.run(
        ['opj_decompress', '-i', jp2_path, '-o', png_path],
        capture_output=True,
    )
    if result.returncode != 0:
        # Decode leniently: the tool's output is not guaranteed to be valid
        # UTF-8, and the error report must not raise UnicodeDecodeError.
        stdout = result.stdout.decode(errors='replace')
        stderr = result.stderr.decode(errors='replace')
        print('ERROR converting {}:\n{}\n{}'.format(jp2_path, stdout, stderr))
    else:
        set_file_perms(png_path)
    return png_path
def export_netpbm(obj, workdir, pagenum):
    """Export an image stream as a Netpbm file, preferring its soft mask.

    The image itself is always written via ``write_pbm``. When the stream
    has an ``SMask`` entry, the mask is written as well (with a ``-mask``
    suffix on the base name) and the mask's path is returned instead.

    Args:
        obj: Image stream; its ``objid`` is used in the file names.
        workdir: Output directory, passed to ``ensure_dir`` first.
        pagenum: Page number used in the file names.

    Returns:
        Path of the mask file when an ``SMask`` is present, otherwise the
        path of the image file.
    """
    oid = obj.objid
    ensure_dir(workdir)
    pbm_base = os.path.join(workdir, f"export-page{pagenum:03d}-obj{oid:05d}")
    pbm_path = write_pbm(obj, pbm_base)

    # Soft mask - use instead if present. 'SMask' is optional, so look it up
    # with .get(); indexing raised KeyError for images without a mask.
    smask = obj.attrs.get('SMask')
    if smask:
        print('extracting pbm mask')
        mask_base = os.path.join(workdir, f"export-page{pagenum:03d}-obj{oid:05d}-mask")
        # write_pbm resolves the reference itself, so pass it through as-is.
        pbm_path = write_pbm(smask, mask_base)
    return pbm_path
def write_pbm(obj, base_path):
    """Write an image stream's decoded data as a binary PGM or PPM file.

    A ``DeviceGray`` colour space selects ``.pgm`` with a ``P5`` header; any
    other colour space selects ``.ppm`` with ``P6``. The header's maximum
    value is 255 when ``BitsPerComponent`` is 8 and 65535 otherwise.

    Args:
        obj: Image stream, or a reference to one (it is passed through
            ``resolve1`` first).
        base_path: Output path without extension.

    Returns:
        The path of the written file, i.e. *base_path* plus the suffix.
    """
    stream = resolve1(obj)
    is_gray = resolve1(stream.attrs['ColorSpace']) == LIT('DeviceGray')
    path = base_path + ('.pgm' if is_gray else '.ppm')
    print('writing pbm: {}'.format(path))
    pixels = stream.get_data()
    with open(path, 'wb') as out:
        # Magic number: P5 = binary graymap, P6 = binary pixmap.
        out.write("P5\n".encode() if is_gray else "P6\n".encode())
        width = stream.attrs['Width']
        height = stream.attrs['Height']
        out.write("{} {}\n".format(width, height).encode())
        eight_bit = stream.attrs['BitsPerComponent'] == 8
        out.write("255\n".encode() if eight_bit else "65535\n".encode())
        out.write(pixels)
    set_file_perms(path)
    return path
def is_inklist_annotation(anno):
    """Return whether *anno* has a ``Subtype`` entry equal to the ``Ink`` literal."""
    if 'Subtype' not in anno:
        return False
    return anno['Subtype'] == LIT('Ink')
def parse_pdf(fname, workdir, debug=0):
    """Extract product boxes and scribbles from a PDF's annotations.

    Each annotation on each page is dispatched by kind:

    * Ink annotations go to ``make_ink_scribble``;
    * annotations with an ``AAPL:AKExtras`` entry go to ``make_aapl_scribble``;
    * annotations with a ``ProCatName`` entry go to ``make_product_box``;
    * anything else is printed and ignored.

    Args:
        fname: Path of the PDF file to read.
        workdir: Directory handed to the scribble builders for exported images.
        debug: Value assigned to the ``debug`` attribute of the
            ``PDFDocument`` and ``PDFParser`` classes.

    Returns:
        A two-element list ``[product_boxes, scribbles]``; entries for which
        a builder returned ``None`` are dropped.
    """
    PDFDocument.debug = debug
    PDFParser.debug = debug

    prod_boxes = []
    scribbles = []
    # The context manager closes the file even if parsing raises part-way.
    with open(fname, 'rb') as fp:
        doc = PDFDocument(PDFParser(fp))
        page_dict = resolve1(doc.catalog['Pages'])
        pages = resolve1(page_dict['Kids'])
        # NOTE(review): only the root node's direct Kids are visited. A PDF
        # with intermediate page-tree nodes would have those nodes counted
        # as single pages and their annotations skipped -- confirm whether
        # such inputs can occur.
        for pagenum, page in enumerate(pages, start=1):
            page = resolve1(page)
            if 'Annots' not in page:
                continue
            # resolve1 is a no-op for direct values; it covers entries that
            # are stored as indirect references, since mediabox is indexed
            # later on.
            mediabox = resolve1(page['MediaBox'])
            annots = resolve1(page['Annots'])
            for anno in annots:
                anno = resolve1(anno)
                if is_inklist_annotation(anno):
                    scribbles.append(make_ink_scribble(anno, pagenum, mediabox, workdir))
                elif 'AAPL:AKExtras' in anno:
                    scribbles.append(make_aapl_scribble(anno, pagenum, mediabox, workdir))
                elif 'ProCatName' in anno:
                    prod_boxes.append(make_product_box(anno, pagenum, mediabox))
                else:
                    print('ignoring other annotation:')
                    print(anno)

    # Builders return None for annotations they had to skip; drop those.
    return [list(filter(None, prod_boxes)), list(filter(None, scribbles))]