import os
import sys
import subprocess
import shutil

import dumper
from pdfminer.psparser import LIT
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdftypes import PDFObjRef, resolve1

from .utils import pdf_rect, ensure_dir, set_file_perms


# (result key, annotation key) pairs copied from a product annotation.
_PRODUCT_FIELDS = (
    ('material', 'ProCatMaterialNumber'),
    ('name', 'ProCatName'),
    ('color', 'ProCatColor'),
    ('gender', 'ProCatGender'),
    ('season', 'ProCatSeason'),
    ('size', 'ProCatSize'),
    ('category', 'ProCatCategory'),
)


def _decoded_field(obj, key):
    """Return ``obj[key]`` decoded to ``str``, or ``''`` when the key is absent."""
    return obj[key].decode() if key in obj else ''


def make_product_box(obj, pagenum, mediabox):
    """Build a product-box record from an annotation dictionary.

    Args:
        obj: Resolved annotation dictionary carrying ``ProCat*`` entries.
        pagenum: 1-based number of the page the annotation sits on.
        mediabox: The page's MediaBox; element 3 is handed to ``pdf_rect``
            together with the annotation rectangle.

    Returns:
        A dict with the decoded ``ProCat*`` values plus ``rect`` and ``page``,
        or ``None`` when the annotation has no (or an empty) ``Rect``.
    """
    # .get() so that a missing Rect reaches the diagnostic branch below
    # instead of raising KeyError.
    rect = obj.get('Rect')
    if not rect:
        print('Annotation without rect:')
        print(dumper.dump(obj))
        return None

    box = {field: _decoded_field(obj, key) for field, key in _PRODUCT_FIELDS}
    box['rect'] = pdf_rect(rect, mediabox[3])
    box['page'] = pagenum
    return box


def make_scribble(obj, pagenum, mediabox, workdir):
    """Export the image behind a scribble annotation and describe it.

    Follows ``AP -> N -> Resources -> XObject -> Im1`` to reach the image
    stream, then exports it according to the stream's ``Filter``.

    Args:
        obj: Resolved annotation dictionary.
        pagenum: 1-based number of the page the annotation sits on.
        mediabox: The page's MediaBox; element 3 is handed to ``pdf_rect``.
        workdir: Directory the exported image files are written to.

    Returns:
        A dict with ``page``, ``rect``, ``objid`` and ``image`` (path of the
        exported file), or ``None`` when there is no usable normal appearance
        or the image filter is neither ``JPXDecode`` nor ``FlateDecode``.
    """
    rect = obj['Rect']  # position on page

    # walk the object tree down to the image
    appearance = resolve1(obj['AP'])
    normal_appearance = appearance['N']
    if not normal_appearance or normal_appearance.objid <= 0:
        print('skipping scribble - no normal appearance')
        return None
    normal_appearance = resolve1(normal_appearance)
    resources = resolve1(normal_appearance['Resources'])
    xobj = resolve1(resources['XObject'])
    im1 = resolve1(xobj['Im1'])  # PDFStream of the image

    filter_name = im1['Filter'].name
    if filter_name == 'JPXDecode':
        path = export_jp2(im1, workdir, pagenum)
    elif filter_name == 'FlateDecode':
        path = export_netpbm(im1, workdir, pagenum)
    else:
        print('skipping unrecognized image')
        return None

    return {
        'page': pagenum,
        'rect': pdf_rect(rect, mediabox[3]),
        'objid': im1.objid,
        'image': path,
    }


def export_jp2(obj, workdir, pagenum):
    """Write a JPEG 2000 stream to disk and convert it to PNG.

    The raw stream data is written to a ``.jp2`` file in ``workdir`` and
    converted by running the external ``opj_decompress`` program.

    Args:
        obj: Image stream; its ``objid`` becomes part of the file names.
        workdir: Output directory (created via ``ensure_dir``).
        pagenum: Page number, used in the file names.

    Returns:
        The PNG path. It is returned even when the conversion fails (an error
        is printed in that case), so the file may not exist.
    """
    oid = obj.objid
    ensure_dir(workdir)
    stem = os.path.join(workdir, f"export-page{pagenum:03d}-obj{oid:05d}")
    jp2_path = stem + ".jp2"
    png_path = stem + ".png"

    data = obj.get_rawdata()
    print('extracting jp2: {}'.format(jp2_path))
    with open(jp2_path, 'wb') as out:
        out.write(data)
    set_file_perms(jp2_path)

    result = subprocess.run(
        ['opj_decompress', '-i', jp2_path, '-o', png_path],
        capture_output=True,
    )
    if result.returncode != 0:
        print('ERROR converting {}:\n{}\n{}'.format(
            jp2_path, result.stdout.decode(), result.stderr.decode()))
    else:
        set_file_perms(png_path)
    return png_path


def export_netpbm(obj, workdir, pagenum):
    """Export a Flate-encoded image stream as a Netpbm file.

    When the stream has an ``SMask`` entry, the mask is exported as well and
    its path is returned instead of the base image's path.

    Args:
        obj: Image stream; its ``objid`` becomes part of the file names.
        workdir: Output directory (created via ``ensure_dir``).
        pagenum: Page number, used in the file names.

    Returns:
        Path of the mask file if a mask is present, else of the image file.
    """
    oid = obj.objid
    ensure_dir(workdir)
    pbm_base = os.path.join(workdir, f"export-page{pagenum:03d}-obj{oid:05d}")
    pbm_path = write_pbm(obj, pbm_base)

    # stencil mask - use instead if present
    # .get(): SMask is optional, plain indexing raised KeyError without it.
    smask = obj.attrs.get('SMask')
    if smask:
        print('extracting pbm mask')
        # write_pbm resolves the reference itself.
        pbm_path = write_pbm(smask, pbm_base + "-mask")
    return pbm_path


def write_pbm(obj, base_path):
    """Write an image stream as a binary PGM or PPM file.

    Args:
        obj: Image stream, or a reference to one (resolved here).
        base_path: Output path without extension; ``.pgm`` is appended for a
            ``DeviceGray`` colour space and ``.ppm`` for anything else.

    Returns:
        The path of the file written.
    """
    obj = resolve1(obj)
    color_space = resolve1(obj.attrs['ColorSpace'])
    is_gray = color_space == LIT('DeviceGray')
    path = base_path + ('.pgm' if is_gray else '.ppm')
    print('writing pbm: {}'.format(path))

    data = obj.get_data()
    # Header: magic number, "width height", maximum sample value.
    magic = b"P5\n" if is_gray else b"P6\n"
    dimensions = "{} {}\n".format(obj.attrs['Width'], obj.attrs['Height']).encode()
    # Every depth other than 8 bits is written with a 16-bit maxval.
    maxval = b"255\n" if obj.attrs['BitsPerComponent'] == 8 else b"65535\n"

    with open(path, 'wb') as out:
        out.write(magic)
        out.write(dimensions)
        out.write(maxval)
        out.write(data)
    set_file_perms(path)
    return path


def parse_pdf(fname, workdir, debug=0):
    """Collect product boxes and scribbles from a PDF's annotations.

    Only the direct ``Kids`` of the catalog's ``Pages`` node are visited.
    Annotations containing ``AAPL:AKExtras`` are treated as scribbles, those
    containing ``ProCatName`` as product boxes; all others are printed and
    ignored.

    Args:
        fname: Path of the PDF file.
        workdir: Directory exported scribble images are written to.
        debug: Value assigned to ``PDFDocument.debug`` and ``PDFParser.debug``.

    Returns:
        A two-element list ``[product_boxes, scribbles]``, each a list of
        dicts with the ``None`` results filtered out.
    """
    PDFDocument.debug = debug
    PDFParser.debug = debug

    prod_boxes = []
    scribbles = []

    # Objects are read from the file as they are resolved, so all the work
    # happens inside the with-block; the handle is closed even on error.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)

        page_dict = resolve1(doc.catalog['Pages'])
        pages = resolve1(page_dict['Kids'])

        for pagenum, page in enumerate(pages, start=1):
            page = resolve1(page)
            if 'Annots' not in page:
                continue

            # resolve1 returns non-references unchanged, so this only matters
            # when MediaBox (or Annots) is stored as an indirect reference.
            mediabox = resolve1(page['MediaBox'])
            annots = resolve1(page['Annots'])

            for anno in annots:
                anno = resolve1(anno)
                if 'AAPL:AKExtras' in anno:
                    scribbles.append(
                        make_scribble(anno, pagenum, mediabox, workdir))
                elif 'ProCatName' in anno:
                    prod_boxes.append(
                        make_product_box(anno, pagenum, mediabox))
                else:
                    print('ignoring other annotation:')
                    print(anno)

    return [list(filter(None, prod_boxes)), list(filter(None, scribbles))]