markup: handle netpbm encoded annotations

This commit is contained in:
2019-12-04 17:02:21 -08:00
parent ae5e95a36a
commit aa2a6bc4ca
2 changed files with 68 additions and 6 deletions

View File

@ -2,7 +2,9 @@ import os
import sys
import subprocess
import shutil
import dumper
from pdfminer.psparser import LIT
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdftypes import PDFObjRef, resolve1
@ -59,8 +61,15 @@ def make_scribble(obj, pagenum, mediabox, workdir):
'rect': pdf_rect(rect, mediabox[3]),
'objid': im1.objid,
'image': path }
elif flter.name == 'FlateDecode':
path = export_netpbm(im1, workdir, pagenum)
return { 'page': pagenum,
'rect': pdf_rect(rect, mediabox[3]),
'objid': im1.objid,
'image': path }
else:
print('skipping non-jp2 image')
print('skipping unrecognized image')
# print(dumper.dump(im1))
return None
@ -85,6 +94,55 @@ def export_jp2(obj, workdir, pagenum):
return png_path
def export_netpbm(obj, workdir, pagenum):
    """Export an image stream as a netpbm file and return the file's path.

    The image is written under *workdir* through ``write_pbm``. When the
    image's attributes carry an ``SMask`` entry, the mask is exported as
    well (same base name with a ``-mask`` suffix) and the mask's path is
    returned in place of the image's.

    Args:
        obj: image stream object; must expose ``objid`` and ``attrs``.
        workdir: directory receiving the exported files; it is passed to
            ``ensure_dir`` before anything is written.
        pagenum: page number, used only to build the output file name.

    Returns:
        The path of the mask file when an ``SMask`` is present, otherwise
        the path of the image file.
    """
    oid = obj.objid
    ensure_dir(workdir)
    pbm_base = os.path.join(workdir, f"export-page{pagenum:03d}-obj{oid:05d}")
    pbm_path = write_pbm(obj, pbm_base)

    # SMask is an optional entry: look it up with .get() so that an image
    # without a mask falls through instead of raising KeyError.
    smask = obj.attrs.get('SMask')
    if not smask:
        return pbm_path

    # mask present - use it instead of the image itself
    print('extracting pbm mask')
    # write_pbm resolves indirect references on its own, so the entry is
    # handed over as-is.
    return write_pbm(smask, pbm_base + "-mask")
def write_pbm(obj, base_path):
    """Dump an image stream to a binary netpbm file and return its path.

    *obj* is resolved with ``resolve1`` first. The file name is
    *base_path* plus ``.pgm`` when the stream's ``ColorSpace`` is
    ``DeviceGray`` and ``.ppm`` for anything else. The file consists of
    the magic number (``P5`` for ``.pgm``, ``P6`` for ``.ppm``), a
    "width height" line, a maxval line and then the bytes returned by the
    stream's ``get_data()``. ``set_file_perms`` is applied to the finished
    file.

    Args:
        obj: image stream, or a reference that ``resolve1`` turns into one.
        base_path: output path without the file extension.

    Returns:
        The full path of the file that was written.
    """
    stream = resolve1(obj)
    attrs = stream.attrs
    is_gray = resolve1(attrs['ColorSpace']) == LIT('DeviceGray')
    path = base_path + ('.pgm' if is_gray else '.ppm')
    print('writing pbm: {}'.format(path))
    data = stream.get_data()
    with open(path, 'wb') as out:
        out.write(b"P5\n" if is_gray else b"P6\n")
        out.write(f"{attrs['Width']} {attrs['Height']}\n".encode())
        # NOTE(review): every bit depth other than 8 is declared with a
        # maxval of 65535 - presumably only 8- and 16-bit images are
        # expected here; confirm against the PDFs being processed.
        out.write(b"255\n" if attrs['BitsPerComponent'] == 8 else b"65535\n")
        out.write(data)
    set_file_perms(path)
    return path
def parse_pdf(fname, workdir, debug=0):
PDFDocument.debug = debug
PDFParser.debug = debug