From aa2a6bc4ca3bdce0eb4f1496d3cf935cda86fbbf Mon Sep 17 00:00:00 2001
From: Seth Ladygo
Date: Wed, 4 Dec 2019 17:02:21 -0800
Subject: [PATCH] markup: handle netpbm encoded annotations

Export FlateDecode image annotations as binary netpbm files ('.pgm' for
DeviceGray, '.ppm' otherwise), returning the path of the image's SMask
export instead when the image has one.  find_shapes() now also accepts
greyscale ('L') images and uses them directly as the alpha layer.
---
Review notes (this section is ignored by git am):

Changes relative to the first revision of this patch:
- export_netpbm: look SMask up with .get() so images without a soft
  mask no longer raise KeyError; drop the unused resolved 'mask' local.
- pdf.py: drop 'import dumper' and the commented-out debug print that
  was its only user.
- find_shapes: keep accepting 'LA' images (the replaced code did) and
  take the alpha band without a redundant convert('RGBA').
- write_pbm: build the header once; add docstrings.

NOTE(review): the layout of this patch was rebuilt from a
whitespace-collapsed copy, so context-line indentation is presumed to
be 4-space; if a hunk does not match, apply with
`git apply --ignore-whitespace`.  The index hashes below are those of
the first revision.

 markup/img.py | 16 +++++++++----
 markup/pdf.py | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 74 insertions(+), 6 deletions(-)

diff --git a/markup/img.py b/markup/img.py
index dfcf031..ebeb5f8 100644
--- a/markup/img.py
+++ b/markup/img.py
@@ -20,12 +20,18 @@ def find_shapes(image_path):
     """
     path = Path(image_path)
 
-    img = Image.open(image_path, 'r')
-    if not img.mode in ('RGBA', 'LA'):
-        print('no alpha channel: {}'.format(img.mode))
-        return None
+    print('finding shapes in {}'.format(image_path))
 
-    alpha_layer = img.convert('RGBA').split()[-1]
+    img = Image.open(image_path, 'r')
+    if img.mode in ('RGBA', 'LA'):
+        # the alpha channel is the last band in both modes
+        alpha_layer = img.split()[-1]
+    elif img.mode == 'L':
+        # greyscale image: its single band is used as the alpha layer
+        alpha_layer = img
+    else:
+        print('unhandled image mode: {}'.format(img.mode))
+        return None
 
     alpha_layer = alpha_layer.filter(ImageFilter.GaussianBlur(5))
 
diff --git a/markup/pdf.py b/markup/pdf.py
index ff1d7eb..863f5b0 100644
--- a/markup/pdf.py
+++ b/markup/pdf.py
@@ -2,7 +2,8 @@ import os
 import sys
 import subprocess
 import shutil
 
+from pdfminer.psparser import LIT
 from pdfminer.pdfparser import PDFParser
 from pdfminer.pdfdocument import PDFDocument
 from pdfminer.pdftypes import PDFObjRef, resolve1
@@ -59,8 +60,14 @@ def make_scribble(obj, pagenum, mediabox, workdir):
                  'rect': pdf_rect(rect, mediabox[3]),
                  'objid': im1.objid,
                  'image': path }
+    elif flter.name == 'FlateDecode':
+        path = export_netpbm(im1, workdir, pagenum)
+        return { 'page': pagenum,
+                 'rect': pdf_rect(rect, mediabox[3]),
+                 'objid': im1.objid,
+                 'image': path }
     else:
-        print('skipping non-jp2 image')
+        print('skipping unrecognized image')
         return None
 
 
@@ -85,6 +92,61 @@ def export_jp2(obj, workdir, pagenum):
     return png_path
 
 
+def export_netpbm(obj, workdir, pagenum):
+    """Export a PDF image stream into workdir as a netpbm file.
+
+    When the image has an SMask entry, the mask is exported as well and
+    its path is returned instead of the image's own.
+    """
+    oid = obj.objid
+    ensure_dir(workdir)
+
+    pbm_base = os.path.join(workdir, f"export-page{pagenum:03d}-obj{oid:05d}")
+    pbm_path = write_pbm(obj, pbm_base)
+
+    # soft mask - use instead if present; not every image has one
+    smask = obj.attrs.get('SMask')
+    if smask:
+        print('extracting pbm mask')
+        pbm_path = write_pbm(smask, pbm_base + '-mask')
+
+    return pbm_path
+
+
+def write_pbm(obj, base_path):
+    """Write a PDF image stream as a binary netpbm file.
+
+    DeviceGray images are written as '.pgm' (P5) and everything else as
+    '.ppm' (P6).  Returns the path of the file written.
+
+    NOTE(review): the decoded stream is written verbatim after the
+    header, so presumably only 8- and 16-bit DeviceGray/DeviceRGB data
+    yields a valid file - confirm no other images reach this path.
+    """
+    obj = resolve1(obj)
+    color_space = resolve1(obj.attrs['ColorSpace'])
+
+    is_gray = color_space == LIT('DeviceGray')
+    path = base_path + ('.pgm' if is_gray else '.ppm')
+
+    print('writing pbm: {}'.format(path))
+
+    header = '{}\n{} {}\n{}\n'.format(
+        'P5' if is_gray else 'P6',
+        obj.attrs['Width'], obj.attrs['Height'],
+        255 if obj.attrs['BitsPerComponent'] == 8 else 65535)
+
+    # decode before opening so a failed decode leaves no empty file
+    data = obj.get_data()
+    with open(path, 'wb') as out:
+        out.write(header.encode())
+        out.write(data)
+
+    set_file_perms(path)
+
+    return path
+
+
 def parse_pdf(fname, workdir, debug=0):
     PDFDocument.debug = debug
     PDFParser.debug = debug