Files
procat2/markup/pdf.py

231 lines
7.2 KiB
Python

import os
import sys
import subprocess
import shutil
import dumper
from pdfminer.psparser import LIT
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdftypes import PDFObjRef, resolve1
from .utils import Rect, pdf_rect, ensure_dir, set_file_perms
from .img import write_inklist, write_square_or_circle
def make_product_box(obj, pagenum, mediabox):
    """Build a product-box record from a ProCat-tagged annotation.

    Args:
        obj: Resolved annotation dictionary. The optional ``ProCat*`` entries
            are decoded with ``.decode()``; a missing entry becomes ``''``.
        pagenum: Page number stored in the record under ``'page'``.
        mediabox: Page media box; ``mediabox[3]`` is passed to ``pdf_rect``
            together with the annotation's ``Rect``.

    Returns:
        A dict with the keys ``material``, ``name``, ``color``, ``gender``,
        ``season``, ``size``, ``category``, ``rect`` and ``page``, or ``None``
        (after printing a dump of the annotation) when the annotation has no
        usable ``Rect``.
    """
    # .get() rather than indexing: an annotation that lacks the key entirely
    # must reach the diagnostic branch instead of raising KeyError.
    rect = obj.get('Rect')
    if not rect:
        print('Annotation without rect:')
        print(dumper.dump(obj))
        return None

    def text(key):
        # Optional ProCat attribute -> decoded text, '' when absent.
        return obj[key].decode() if key in obj else ''

    return {
        'material': text('ProCatMaterialNumber'),
        'name': text('ProCatName'),
        'color': text('ProCatColor'),
        'gender': text('ProCatGender'),
        'season': text('ProCatSeason'),
        'size': text('ProCatSize'),
        'category': text('ProCatCategory'),
        'rect': pdf_rect(rect, mediabox[3]),
        'page': pagenum,
    }
def make_ink_scribble(obj, pagenum, mediabox, workdir):
    """Describe an Ink annotation as a scribble record backed by a PNG file.

    The annotation's ``NM`` entry (decoded as UTF-8) is used both as the
    record's ``objid`` and as part of the PNG file name inside *workdir*.
    The annotation, media box and target path are handed to
    ``write_inklist``, which is expected to produce the image.

    Returns:
        A dict with ``page``, ``rect`` (a ``Rect`` built from the whole
        media box), ``objid`` and ``image`` (the PNG path).
    """
    annot_name = obj['NM'].decode('utf-8')
    image_file = f"export-page{pagenum:03d}-nm{annot_name}.png"
    image_path = os.path.join(workdir, image_file)

    write_inklist(obj, mediabox, image_path)

    scribble = dict(
        page=pagenum,
        rect=Rect(*mediabox),
        objid=annot_name,
        image=image_path,
    )
    return scribble
def make_square_or_circle_scribble(obj, pagenum, mediabox, workdir):
    """Describe a Square/Circle annotation as a scribble record with a PNG.

    The annotation's ``NM`` entry (decoded as UTF-8) is used both as the
    record's ``objid`` and as part of the PNG file name inside *workdir*.
    The annotation, media box and target path are handed to
    ``write_square_or_circle``, which is expected to produce the image.

    Returns:
        A dict with ``page``, ``rect`` (a ``Rect`` built from the whole
        media box), ``objid`` and ``image`` (the PNG path).
    """
    annot_name = obj['NM'].decode('utf-8')
    image_file = f"export-page{pagenum:03d}-nm{annot_name}.png"
    image_path = os.path.join(workdir, image_file)

    write_square_or_circle(obj, mediabox, image_path)

    scribble = dict(
        page=pagenum,
        rect=Rect(*mediabox),
        objid=annot_name,
        image=image_path,
    )
    return scribble
def make_aapl_scribble(obj, pagenum, mediabox, workdir):
    """Export the image behind an ``AAPL:AKExtras`` annotation as a scribble.

    Follows ``AP -> N -> Resources -> XObject -> Im1`` from the annotation to
    the image stream and exports it according to its ``Filter`` name:
    ``JPXDecode`` goes through ``export_jp2`` and ``FlateDecode`` through
    ``export_netpbm``.

    Returns:
        A dict with ``page``, ``rect`` (the annotation ``Rect`` passed
        through ``pdf_rect`` with ``mediabox[3]``), ``objid`` (the image
        stream's object id) and ``image`` (the exported path). ``None`` is
        returned, after printing a notice, when there is no normal
        appearance or the filter is not one of the two handled ones.
    """
    position = obj['Rect']  # where the annotation sits on the page

    ap_dict = resolve1(obj['AP'])
    normal_ref = ap_dict['N']
    if not normal_ref or normal_ref.objid <= 0:
        print('skipping scribble - no normal appearance')
        return

    # Descend through the appearance stream to the embedded image stream.
    normal_stream = resolve1(normal_ref)
    res = resolve1(normal_stream['Resources'])
    xobjects = resolve1(res['XObject'])
    image = resolve1(xobjects['Im1'])
    image_filter = image['Filter']

    if image_filter.name == 'JPXDecode':
        exporter = export_jp2
    elif image_filter.name == 'FlateDecode':
        exporter = export_netpbm
    else:
        print('skipping unrecognized image')
        return None

    exported = exporter(image, workdir, pagenum)
    return {
        'page': pagenum,
        'rect': pdf_rect(position, mediabox[3]),
        'objid': image.objid,
        'image': exported,
    }
def export_jp2(obj, workdir, pagenum):
    """Dump a JPEG 2000 image stream to disk and convert it to PNG.

    The stream's raw data is written to
    ``export-page<NNN>-obj<NNNNN>.jp2`` inside *workdir* (created through
    ``ensure_dir``), then ``opj_decompress`` is run to produce the matching
    ``.png`` next to it.

    Args:
        obj: Image stream; ``objid`` and ``get_rawdata()`` are used.
        workdir: Directory receiving both files.
        pagenum: Page number used in the file names.

    Returns:
        The PNG path. NOTE: the path is returned even when the conversion
        failed (an error is printed in that case), so the file may not exist.
    """
    oid = obj.objid
    ensure_dir(workdir)
    jp2_path = os.path.join(workdir, f"export-page{pagenum:03d}-obj{oid:05d}.jp2")
    png_path = os.path.join(workdir, f"export-page{pagenum:03d}-obj{oid:05d}.png")
    data = obj.get_rawdata()
    print('extracting jp2: {}'.format(jp2_path))
    with open(jp2_path, 'wb') as out:
        out.write(data)
    set_file_perms(jp2_path)

    result = subprocess.run(
        ['opj_decompress', '-i', jp2_path, '-o', png_path],
        capture_output=True,
    )
    if result.returncode != 0:
        # errors='replace': the tool's output is not guaranteed to be valid
        # UTF-8, and reporting a failure must not raise UnicodeDecodeError.
        print('ERROR converting {}:\n{}\n{}'.format(
            jp2_path,
            result.stdout.decode(errors='replace'),
            result.stderr.decode(errors='replace'),
        ))
    else:
        set_file_perms(png_path)
    return png_path
def export_netpbm(obj, workdir, pagenum):
    """Export an image stream as a Netpbm file, preferring its soft mask.

    The image itself is always written through ``write_pbm``. When the
    stream carries an ``SMask`` entry, the mask is written as well (with a
    ``-mask`` suffix in the base name) and its path is returned instead of
    the image's.

    Args:
        obj: Image stream; ``objid`` and ``attrs`` are used.
        workdir: Output directory (created through ``ensure_dir``).
        pagenum: Page number used in the file names.

    Returns:
        Path of the mask file when an ``SMask`` is present, otherwise the
        path of the image file.
    """
    oid = obj.objid
    ensure_dir(workdir)
    pbm_base = os.path.join(workdir, f"export-page{pagenum:03d}-obj{oid:05d}")
    pbm_path = write_pbm(obj, pbm_base)

    # stencil mask - use instead if present.
    # .get(): SMask is optional, so a plain image must not raise KeyError.
    smask = obj.attrs.get('SMask')
    if smask:
        print('extracting pbm mask')
        mask_base = os.path.join(workdir, f"export-page{pagenum:03d}-obj{oid:05d}-mask")
        # write_pbm resolves the reference itself.
        pbm_path = write_pbm(smask, mask_base)
    return pbm_path
def write_pbm(obj, base_path):
    """Write an image stream's decoded data as a binary Netpbm file.

    The stream (resolved first if it is a reference) is written as ``P5``
    with a ``.pgm`` suffix when its ``ColorSpace`` equals
    ``LIT('DeviceGray')``, otherwise as ``P6`` with a ``.ppm`` suffix. The
    header maxval is 255 when ``BitsPerComponent`` is 8 and 65535 for any
    other value.

    Args:
        obj: Image stream or a reference to one.
        base_path: Output path without extension.

    Returns:
        The path of the file written (``base_path`` plus the suffix).
    """
    stream = resolve1(obj)
    is_gray = resolve1(stream.attrs['ColorSpace']) == LIT('DeviceGray')
    suffix = '.pgm' if is_gray else '.ppm'
    path = base_path + suffix
    print(f'writing pbm: {path}')
    pixels = stream.get_data()

    with open(path, 'wb') as out:
        # Netpbm header: magic number, dimensions, maximum sample value.
        out.write(b"P5\n" if suffix == '.pgm' else b"P6\n")
        out.write(f"{stream.attrs['Width']} {stream.attrs['Height']}\n".encode())
        eight_bit = stream.attrs['BitsPerComponent'] == 8
        out.write(b"255\n" if eight_bit else b"65535\n")
        out.write(pixels)

    set_file_perms(path)
    return path
def is_inklist_annotation(anno):
    """Return whether *anno* has a ``Subtype`` equal to ``LIT('Ink')``.

    An annotation without a ``Subtype`` entry yields ``False``.
    """
    if 'Subtype' not in anno:
        return False
    return anno["Subtype"] == LIT('Ink')
def is_square_or_circle_annotation(anno):
    """Return ``True`` when *anno*'s ``Subtype`` is ``Square`` or ``Circle``.

    An annotation without a ``Subtype`` entry, or with any other subtype,
    yields ``False``.
    """
    if 'Subtype' not in anno:
        return False
    # Checked in the same order as before: Square first, then Circle.
    for wanted in ('Square', 'Circle'):
        if anno["Subtype"] == LIT(wanted):
            return True
    return False
def parse_pdf(fname, workdir, debug=0):
    """Collect product boxes and scribbles from a PDF's annotations.

    Each entry of the root ``Pages`` node's ``Kids`` is treated as a page
    and numbered from 1. For every annotation on a page that has
    ``Annots``:

    * Ink annotations -> ``make_ink_scribble``
    * Square / Circle annotations -> ``make_square_or_circle_scribble``
    * annotations carrying ``AAPL:AKExtras`` -> ``make_aapl_scribble``
    * annotations carrying ``ProCatName`` -> ``make_product_box``
    * everything else is reported on stdout and ignored.

    NOTE(review): only the direct ``Kids`` of the root node are visited and
    ``MediaBox`` is read from the page dictionary itself, so nested page-tree
    nodes or an inherited media box are not handled here.

    Args:
        fname: Path of the PDF file.
        workdir: Directory passed to the scribble builders for image output.
        debug: Value assigned to ``PDFDocument.debug`` and
            ``PDFParser.debug``.

    Returns:
        A two-element list ``[product_boxes, scribbles]`` with falsy entries
        (builders that returned ``None``) filtered out.
    """
    PDFDocument.debug = debug
    PDFParser.debug = debug

    prod_boxes = []
    scribbles = []

    # The document is read from the open file while it is walked, so all
    # processing stays inside the ``with``; the handle is closed even when
    # an annotation handler raises.
    with open(fname, 'rb') as fp:
        doc = PDFDocument(PDFParser(fp))
        page_dict = resolve1(doc.catalog['Pages'])
        pages = resolve1(page_dict['Kids'])

        for pagenum, page in enumerate(pages, start=1):
            page = resolve1(page)
            if 'Annots' not in page:
                continue
            mediabox = page['MediaBox']

            # resolve1 returns non-reference values unchanged.
            for anno in resolve1(page['Annots']):
                anno = resolve1(anno)
                # .get(): a Subtype-less annotation must fall through to the
                # final "other" branch instead of raising KeyError.
                subtype = anno.get('Subtype')
                if is_inklist_annotation(anno):
                    scribbles.append(make_ink_scribble(anno, pagenum, mediabox, workdir))
                elif is_square_or_circle_annotation(anno):
                    scribbles.append(make_square_or_circle_scribble(anno, pagenum, mediabox, workdir))
                elif 'AAPL:AKExtras' in anno:
                    scribbles.append(make_aapl_scribble(anno, pagenum, mediabox, workdir))
                elif 'ProCatName' in anno:
                    prod_boxes.append(make_product_box(anno, pagenum, mediabox))
                elif subtype == LIT('FreeText'):
                    print('ignoring FreeText annotation')
                elif subtype == LIT('Highlight'):
                    print('ignoring Highlight annotation')
                else:
                    print('ignoring other annotation:')
                    print(anno)

    return [list(filter(None, prod_boxes)), list(filter(None, scribbles))]