markup: pass workdir around, don't recalc all the time

This commit is contained in:
2019-10-19 22:08:42 -07:00
parent ed5236c618
commit e434d3f705
5 changed files with 28 additions and 31 deletions

View File

@@ -7,7 +7,7 @@ from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdftypes import PDFObjRef, resolve1
-from .utils import pdf_rect, ensure_dir, set_file_perms, WORKDIR
+from .utils import pdf_rect, ensure_dir, set_file_perms
def make_product_box(obj, pagenum, mediabox):
@@ -33,7 +33,7 @@ def make_product_box(obj, pagenum, mediabox):
return None
-def make_scribble(obj, pagenum, mediabox, subdir, name):
+def make_scribble(obj, pagenum, mediabox, workdir):
rect = obj['Rect'] # position on page
# walk the object tree down to the image
@@ -50,7 +50,7 @@ def make_scribble(obj, pagenum, mediabox, subdir, name):
flter = im1['Filter']
if flter.name == 'JPXDecode':
-path = export_jp2(im1, subdir, name, pagenum)
+path = export_jp2(im1, workdir, pagenum)
return { 'page': pagenum,
'rect': pdf_rect(rect, mediabox[3]),
'objid': im1.objid,
@@ -60,12 +60,11 @@ def make_scribble(obj, pagenum, mediabox, subdir, name):
return None
-def export_jp2(obj, subdir, name, pagenum):
+def export_jp2(obj, workdir, pagenum):
oid = obj.objid
-dir = os.path.join(WORKDIR, subdir, name)
-ensure_dir(dir)
-jp2_path = os.path.join(dir, f"export-page{pagenum:03d}-obj{oid:05d}.jp2")
-png_path = os.path.join(dir, f"export-page{pagenum:03d}-obj{oid:05d}.png")
+ensure_dir(workdir)
+jp2_path = os.path.join(workdir, f"export-page{pagenum:03d}-obj{oid:05d}.jp2")
+png_path = os.path.join(workdir, f"export-page{pagenum:03d}-obj{oid:05d}.png")
data = obj.get_rawdata()
print('extracting jp2: {}'.format(jp2_path))
@@ -82,7 +81,7 @@ def export_jp2(obj, subdir, name, pagenum):
return png_path
-def parse_pdf(fname, subdir, name, debug=0):
+def parse_pdf(fname, workdir, debug=0):
PDFDocument.debug = debug
PDFParser.debug = debug
@@ -113,7 +112,7 @@ def parse_pdf(fname, subdir, name, debug=0):
for anno in annots:
anno = resolve1(anno)
if 'AAPL:AKExtras' in anno:
-scribbles.append(make_scribble(anno, pagenum, mediabox, subdir, name))
+scribbles.append(make_scribble(anno, pagenum, mediabox, workdir))
elif 'ProCatName' in anno:
prod_boxes.append(make_product_box(anno, pagenum, mediabox))
else: