markup: handle netpbm encoded annotations

This commit is contained in:
2019-12-04 17:02:21 -08:00
parent ae5e95a36a
commit aa2a6bc4ca
2 changed files with 68 additions and 6 deletions

View File

@ -20,12 +20,16 @@ def find_shapes(image_path):
""" """
path = Path(image_path) path = Path(image_path)
img = Image.open(image_path, 'r') print('finding shapes in {}'.format(image_path))
if not img.mode in ('RGBA', 'LA'):
print('no alpha channel: {}'.format(img.mode))
return None
alpha_layer = img.convert('RGBA').split()[-1] img = Image.open(image_path, 'r')
if img.mode == 'RGBA':
alpha_layer = img.convert('RGBA').split()[-1]
elif img.mode == 'L':
alpha_layer = img
else:
print('unhandled image mode: {}'.format(img.mode))
return None
alpha_layer = alpha_layer.filter(ImageFilter.GaussianBlur(5)) alpha_layer = alpha_layer.filter(ImageFilter.GaussianBlur(5))

View File

@ -2,7 +2,9 @@ import os
import sys import sys
import subprocess import subprocess
import shutil import shutil
import dumper
from pdfminer.psparser import LIT
from pdfminer.pdfparser import PDFParser from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdftypes import PDFObjRef, resolve1 from pdfminer.pdftypes import PDFObjRef, resolve1
@ -59,8 +61,15 @@ def make_scribble(obj, pagenum, mediabox, workdir):
'rect': pdf_rect(rect, mediabox[3]), 'rect': pdf_rect(rect, mediabox[3]),
'objid': im1.objid, 'objid': im1.objid,
'image': path } 'image': path }
elif flter.name == 'FlateDecode':
path = export_netpbm(im1, workdir, pagenum)
return { 'page': pagenum,
'rect': pdf_rect(rect, mediabox[3]),
'objid': im1.objid,
'image': path }
else: else:
print('skipping non-jp2 image') print('skipping unrecognized image')
# print(dumper.dump(im1))
return None return None
@ -85,6 +94,55 @@ def export_jp2(obj, workdir, pagenum):
return png_path return png_path
def export_netpbm(obj, workdir, pagenum):
    """Export an image stream to a netpbm file inside *workdir*.

    The image itself is always written via ``write_pbm``. When the stream's
    attributes carry an ``SMask`` entry, that mask is written as well (with a
    ``-mask`` suffix on the base name) and the mask's path is returned
    instead of the image's path.

    Args:
        obj: image stream object exposing ``objid`` and an ``attrs`` mapping.
        workdir: directory the exported files are written to; it is passed
            to ``ensure_dir`` before anything is written.
        pagenum: page number, used only to build the output file name.

    Returns:
        Path of the mask file when an ``SMask`` is present, otherwise the
        path of the exported image file.
    """
    oid = obj.objid
    ensure_dir(workdir)

    pbm_base = os.path.join(workdir, f"export-page{pagenum:03d}-obj{oid:05d}")
    pbm_path = write_pbm(obj, pbm_base)

    # The mask is optional: look it up with .get() so that images without an
    # 'SMask' entry fall through to the plain image path instead of raising
    # KeyError.
    smask = obj.attrs.get('SMask')
    if smask:
        print('extracting pbm mask')
        # write_pbm resolves indirect references itself, so the raw entry
        # can be handed over as-is.
        pbm_path = write_pbm(smask, pbm_base + "-mask")

    return pbm_path
def write_pbm(obj, base_path):
    """Write an image stream to disk as a binary netpbm file.

    The stream's ``ColorSpace`` selects the flavour: ``DeviceGray`` produces
    a ``.pgm`` file with a ``P5`` header, anything else produces a ``.ppm``
    file with a ``P6`` header. The header is followed by the stream's
    decoded data, written unchanged.

    Args:
        obj: the image stream, or a reference to it (it is passed through
            ``resolve1`` first).
        base_path: output path without extension; the suffix is appended
            here.

    Returns:
        The full path of the file that was written.
    """
    stream = resolve1(obj)
    is_gray = resolve1(stream.attrs['ColorSpace']) == LIT('DeviceGray')

    suffix = '.pgm' if is_gray else '.ppm'
    path = base_path + suffix
    print('writing pbm: {}'.format(path))

    data = stream.get_data()
    with open(path, 'wb') as out:
        # Magic number: P5 = binary graymap, P6 = binary pixmap.
        out.write(b"P5\n" if is_gray else b"P6\n")

        dimensions = "{} {}\n".format(stream.attrs['Width'],
                                      stream.attrs['Height'])
        out.write(dimensions.encode())

        # NOTE(review): every depth other than 8 is declared as 16-bit
        # (maxval 65535); confirm that 1/2/4-bit images never reach here.
        if stream.attrs['BitsPerComponent'] == 8:
            out.write(b"255\n")
        else:
            out.write(b"65535\n")

        out.write(data)

    set_file_perms(path)
    return path
def parse_pdf(fname, workdir, debug=0): def parse_pdf(fname, workdir, debug=0):
PDFDocument.debug = debug PDFDocument.debug = debug
PDFParser.debug = debug PDFParser.debug = debug