"""Extract product boxes and hand-drawn "scribble" annotations from a PDF.

The entry point is :func:`parse_pdf`; the remaining functions turn a single
resolved annotation dictionary into a plain ``dict`` description and, for
scribbles, export an image file into a working directory.
"""
import os
import sys
import subprocess
import shutil

import dumper
from pdfminer.psparser import LIT
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdftypes import PDFObjRef, resolve1

from .utils import Rect, pdf_rect, ensure_dir, set_file_perms
from .img import write_inklist, write_square_or_circle


def _decoded_or_empty(obj, key):
    """Return ``obj[key]`` decoded to ``str``, or ``''`` if *key* is absent."""
    return obj[key].decode() if key in obj else ''


def make_product_box(obj, pagenum, mediabox):
    """Build a product-box description from a ``ProCat*`` annotation.

    Args:
        obj: Resolved annotation dictionary.
        pagenum: 1-based page number the annotation was found on.
        mediabox: The page's MediaBox; its fourth entry is handed to
            ``pdf_rect`` together with the annotation rectangle.

    Returns:
        A dict with the decoded ``ProCat*`` fields plus ``'rect'`` and
        ``'page'``, or ``None`` when the annotation has no usable ``Rect``
        (the annotation is dumped to stdout in that case).
    """
    # A missing 'Rect' key is treated like an empty one instead of raising
    # KeyError, so the diagnostic branch below is reached in both cases.
    rect = obj['Rect'] if 'Rect' in obj else None
    if not rect:
        print('Annotation without rect:')
        print(dumper.dump(obj))
        return None

    return {
        'material': _decoded_or_empty(obj, 'ProCatMaterialNumber'),
        'name': _decoded_or_empty(obj, 'ProCatName'),
        'color': _decoded_or_empty(obj, 'ProCatColor'),
        'gender': _decoded_or_empty(obj, 'ProCatGender'),
        'season': _decoded_or_empty(obj, 'ProCatSeason'),
        'size': _decoded_or_empty(obj, 'ProCatSize'),
        'category': _decoded_or_empty(obj, 'ProCatCategory'),
        'rect': pdf_rect(rect, mediabox[3]),
        'page': pagenum,
    }


def _make_rendered_scribble(obj, pagenum, mediabox, workdir, writer):
    """Render an annotation to a PNG with *writer* and describe the result.

    *writer* is called as ``writer(obj, mediabox, png_path)``. The returned
    scribble covers the whole MediaBox (``Rect(*mediabox)``) and is
    identified by the annotation's ``NM`` entry.
    """
    oid = obj['NM'].decode('utf-8')
    png_path = os.path.join(workdir, f"export-page{pagenum:03d}-nm{oid}.png")
    writer(obj, mediabox, png_path)
    return {
        'page': pagenum,
        'rect': Rect(*mediabox),
        'objid': oid,
        'image': png_path,
    }


def make_ink_scribble(obj, pagenum, mediabox, workdir):
    """Export an ``Ink`` annotation via ``write_inklist`` and describe it.

    Returns a dict with ``'page'``, ``'rect'``, ``'objid'`` and ``'image'``.
    """
    return _make_rendered_scribble(obj, pagenum, mediabox, workdir,
                                   write_inklist)


def make_square_or_circle_scribble(obj, pagenum, mediabox, workdir):
    """Export a ``Square``/``Circle`` annotation and describe it.

    The image is produced by ``write_square_or_circle``. Returns a dict with
    ``'page'``, ``'rect'``, ``'objid'`` and ``'image'``.
    """
    return _make_rendered_scribble(obj, pagenum, mediabox, workdir,
                                   write_square_or_circle)


def make_aapl_scribble(obj, pagenum, mediabox, workdir):
    """Export the image behind an ``AAPL:AKExtras`` annotation.

    Walks ``AP -> N -> Resources -> XObject -> Im1`` to reach the image
    stream and exports it according to its ``Filter``.

    Returns:
        A dict with ``'page'``, ``'rect'``, ``'objid'`` and ``'image'``, or
        ``None`` when there is no normal appearance or the image filter is
        neither ``JPXDecode`` nor ``FlateDecode``.
    """
    rect = obj['Rect']  # position on page

    # walk the object tree down to the image
    appearance = resolve1(obj['AP'])
    normal_appearance = appearance['N']
    if not normal_appearance or normal_appearance.objid <= 0:
        print('skipping scribble - no normal appearance')
        return None

    normal_appearance = resolve1(normal_appearance)
    resources = resolve1(normal_appearance['Resources'])
    xobj = resolve1(resources['XObject'])
    im1 = resolve1(xobj['Im1'])  # PDFStream of the image

    flter = im1['Filter']
    if flter.name == 'JPXDecode':
        path = export_jp2(im1, workdir, pagenum)
    elif flter.name == 'FlateDecode':
        path = export_netpbm(im1, workdir, pagenum)
    else:
        print('skipping unrecognized image')
        return None

    return {
        'page': pagenum,
        'rect': pdf_rect(rect, mediabox[3]),
        'objid': im1.objid,
        'image': path,
    }


def export_jp2(obj, workdir, pagenum):
    """Write a JPEG 2000 stream to *workdir* and convert it to PNG.

    The raw stream data is saved as ``.jp2`` and then converted by running
    the external ``opj_decompress`` command.

    Returns:
        The PNG path. NOTE: the path is returned even when the conversion
        command fails (the failure is only printed), so callers must not
        assume the file exists.
    """
    oid = obj.objid
    ensure_dir(workdir)
    stem = os.path.join(workdir, f"export-page{pagenum:03d}-obj{oid:05d}")
    jp2_path = stem + ".jp2"
    png_path = stem + ".png"

    data = obj.get_rawdata()
    print('extracting jp2: {}'.format(jp2_path))
    with open(jp2_path, 'wb') as out:
        out.write(data)
    set_file_perms(jp2_path)

    result = subprocess.run(
        ['opj_decompress', '-i', jp2_path, '-o', png_path],
        capture_output=True,
    )
    if result.returncode != 0:
        print('ERROR converting {}:\n{}\n{}'.format(
            jp2_path, result.stdout.decode(), result.stderr.decode()))
    else:
        set_file_perms(png_path)
    return png_path


def export_netpbm(obj, workdir, pagenum):
    """Write an image stream (or its ``SMask``) as a Netpbm file.

    The image itself is always written. If the stream has an ``SMask``
    entry, the mask is written as well and its path is returned instead.

    Returns:
        Path of the mask file when a mask is present, otherwise the path of
        the image file.
    """
    oid = obj.objid
    ensure_dir(workdir)
    pbm_base = os.path.join(workdir, f"export-page{pagenum:03d}-obj{oid:05d}")
    pbm_path = write_pbm(obj, pbm_base)

    # mask - use instead if present. ``.get`` because the entry is optional;
    # indexing raised KeyError for images without one.
    smask = obj.attrs.get('SMask')
    if smask:
        print('extracting pbm mask')
        # write_pbm resolves the reference itself.
        pbm_path = write_pbm(smask, pbm_base + "-mask")
    return pbm_path


def write_pbm(obj, base_path):
    """Write the decoded data of an image stream as a binary PGM or PPM.

    Args:
        obj: Image stream or a reference to one (it is resolved here).
        base_path: Output path without extension; ``.pgm`` is appended for
            ``DeviceGray`` images and ``.ppm`` for everything else.

    Returns:
        The full path of the file written.
    """
    obj = resolve1(obj)
    color_space = resolve1(obj.attrs['ColorSpace'])
    is_gray = color_space == LIT('DeviceGray')
    suffix = '.pgm' if is_gray else '.ppm'
    path = base_path + suffix
    print('writing pbm: {}'.format(path))

    data = obj.get_data()
    magic = "P5\n" if is_gray else "P6\n"
    dimensions = "{} {}\n".format(obj.attrs['Width'], obj.attrs['Height'])
    # Any depth other than 8 bits per component is written with a 16-bit
    # maximum sample value.
    maxval = "255\n" if obj.attrs['BitsPerComponent'] == 8 else "65535\n"

    with open(path, 'wb') as out:
        out.write((magic + dimensions + maxval).encode())
        out.write(data)
    set_file_perms(path)
    return path


def is_inklist_annotation(anno):
    """Return True if *anno* has ``Subtype`` ``Ink``."""
    return 'Subtype' in anno and anno["Subtype"] == LIT('Ink')


def is_square_or_circle_annotation(anno):
    """Return True if *anno* has ``Subtype`` ``Square`` or ``Circle``."""
    if 'Subtype' not in anno:
        return False
    subtype = anno["Subtype"]
    return subtype == LIT('Square') or subtype == LIT('Circle')


def parse_pdf(fname, workdir, debug=0):
    """Collect product boxes and scribbles from every annotated page.

    Args:
        fname: Path of the PDF file to read.
        workdir: Directory that receives exported scribble images.
        debug: Value assigned to ``PDFDocument.debug`` and
            ``PDFParser.debug``.

    Returns:
        A two-element list ``[product_boxes, scribbles]``; entries for which
        a ``make_*`` function returned a falsy value are dropped.
    """
    PDFDocument.debug = debug
    PDFParser.debug = debug

    prod_boxes = []
    scribbles = []

    # pdfminer reads objects lazily, so the file has to stay open for the
    # whole traversal; ``with`` also closes it if an annotation handler
    # raises.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)

        page_dict = resolve1(doc.catalog['Pages'])
        # NOTE(review): only the direct 'Kids' of the root are visited;
        # nested page-tree nodes are not descended into.
        pages = resolve1(page_dict['Kids'])

        for pagenum, page in enumerate(pages, start=1):
            page = resolve1(page)
            if 'Annots' not in page:
                continue

            # MediaBox may be stored as an indirect reference.
            mediabox = resolve1(page['MediaBox'])
            annots = resolve1(page['Annots'])

            for anno in annots:
                anno = resolve1(anno)
                # Subtype may be missing on malformed annotations; those
                # fall through to the generic "other" branch.
                subtype = anno['Subtype'] if 'Subtype' in anno else None

                if is_inklist_annotation(anno):
                    scribbles.append(
                        make_ink_scribble(anno, pagenum, mediabox, workdir))
                elif is_square_or_circle_annotation(anno):
                    scribbles.append(
                        make_square_or_circle_scribble(
                            anno, pagenum, mediabox, workdir))
                elif 'AAPL:AKExtras' in anno:
                    scribbles.append(
                        make_aapl_scribble(anno, pagenum, mediabox, workdir))
                elif 'ProCatName' in anno:
                    prod_boxes.append(
                        make_product_box(anno, pagenum, mediabox))
                elif subtype == LIT('FreeText'):
                    print('ignoring FreeText annotation')
                elif subtype == LIT('Highlight'):
                    print('ignoring Highlight annotation')
                else:
                    print('ignoring other annotation:')
                    print(anno)

    return [
        [box for box in prod_boxes if box],
        [scribble for scribble in scribbles if scribble],
    ]