markup: pass workdir around, don't recalc all the time
This commit is contained in:
@ -76,11 +76,10 @@ def find_shapes(image_path):
|
|||||||
return img.width, img.height, bboxes
|
return img.width, img.height, bboxes
|
||||||
|
|
||||||
|
|
||||||
def write_debug_image(subdir, name, page_num, prods, scribbles):
|
def write_debug_image(workdir, page_num, prods, scribbles):
|
||||||
"""Draw an image with boxes for products, images, and shapes."""
|
"""Draw an image with boxes for products, images, and shapes."""
|
||||||
dir = os.path.join(WORKDIR, subdir, name)
|
ensure_dir(workdir)
|
||||||
ensure_dir(dir)
|
path = os.path.join(workdir, f"debug-page{page_num:03d}.png")
|
||||||
path = os.path.join(dir, f"debug-page{page_num:03d}.png")
|
|
||||||
|
|
||||||
pagew = int(11*72)
|
pagew = int(11*72)
|
||||||
pageh = int(8.5*72)
|
pageh = int(8.5*72)
|
||||||
|
|||||||
@ -3,9 +3,9 @@ from .pdf import parse_pdf
|
|||||||
from .utils import overlaps
|
from .utils import overlaps
|
||||||
|
|
||||||
|
|
||||||
def find_marked_products(pdf, subdir, catname, debug=0):
|
def find_marked_products(pdf, workdir, debug=0):
|
||||||
"""Main entry point. Give a pdf, get matches."""
|
"""Main entry point. Give a pdf, get matches."""
|
||||||
(prods, scribbles) = parse_pdf(pdf, subdir, catname, debug)
|
(prods, scribbles) = parse_pdf(pdf, workdir, debug)
|
||||||
|
|
||||||
if not prods or len(prods) < 1:
|
if not prods or len(prods) < 1:
|
||||||
print('no product placement markers found')
|
print('no product placement markers found')
|
||||||
@ -15,7 +15,7 @@ def find_marked_products(pdf, subdir, catname, debug=0):
|
|||||||
matches = find_matches(prods, scribbles, 0.10)
|
matches = find_matches(prods, scribbles, 0.10)
|
||||||
|
|
||||||
for s in scribbles:
|
for s in scribbles:
|
||||||
write_debug_image(subdir, catname, s['page'], prods, scribbles)
|
write_debug_image(workdir, s['page'], prods, scribbles)
|
||||||
|
|
||||||
return matches
|
return matches
|
||||||
|
|
||||||
|
|||||||
@ -7,7 +7,7 @@ from pdfminer.pdfparser import PDFParser
|
|||||||
from pdfminer.pdfdocument import PDFDocument
|
from pdfminer.pdfdocument import PDFDocument
|
||||||
from pdfminer.pdftypes import PDFObjRef, resolve1
|
from pdfminer.pdftypes import PDFObjRef, resolve1
|
||||||
|
|
||||||
from .utils import pdf_rect, ensure_dir, set_file_perms, WORKDIR
|
from .utils import pdf_rect, ensure_dir, set_file_perms
|
||||||
|
|
||||||
|
|
||||||
def make_product_box(obj, pagenum, mediabox):
|
def make_product_box(obj, pagenum, mediabox):
|
||||||
@ -33,7 +33,7 @@ def make_product_box(obj, pagenum, mediabox):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def make_scribble(obj, pagenum, mediabox, subdir, name):
|
def make_scribble(obj, pagenum, mediabox, workdir):
|
||||||
rect = obj['Rect'] # position on page
|
rect = obj['Rect'] # position on page
|
||||||
|
|
||||||
# walk the object tree down to the image
|
# walk the object tree down to the image
|
||||||
@ -50,7 +50,7 @@ def make_scribble(obj, pagenum, mediabox, subdir, name):
|
|||||||
|
|
||||||
flter = im1['Filter']
|
flter = im1['Filter']
|
||||||
if flter.name == 'JPXDecode':
|
if flter.name == 'JPXDecode':
|
||||||
path = export_jp2(im1, subdir, name, pagenum)
|
path = export_jp2(im1, workdir, pagenum)
|
||||||
return { 'page': pagenum,
|
return { 'page': pagenum,
|
||||||
'rect': pdf_rect(rect, mediabox[3]),
|
'rect': pdf_rect(rect, mediabox[3]),
|
||||||
'objid': im1.objid,
|
'objid': im1.objid,
|
||||||
@ -60,12 +60,11 @@ def make_scribble(obj, pagenum, mediabox, subdir, name):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def export_jp2(obj, subdir, name, pagenum):
|
def export_jp2(obj, workdir, pagenum):
|
||||||
oid = obj.objid
|
oid = obj.objid
|
||||||
dir = os.path.join(WORKDIR, subdir, name)
|
ensure_dir(workdir)
|
||||||
ensure_dir(dir)
|
jp2_path = os.path.join(workdir, f"export-page{pagenum:03d}-obj{oid:05d}.jp2")
|
||||||
jp2_path = os.path.join(dir, f"export-page{pagenum:03d}-obj{oid:05d}.jp2")
|
png_path = os.path.join(workdir, f"export-page{pagenum:03d}-obj{oid:05d}.png")
|
||||||
png_path = os.path.join(dir, f"export-page{pagenum:03d}-obj{oid:05d}.png")
|
|
||||||
|
|
||||||
data = obj.get_rawdata()
|
data = obj.get_rawdata()
|
||||||
print('extracting jp2: {}'.format(jp2_path))
|
print('extracting jp2: {}'.format(jp2_path))
|
||||||
@ -82,7 +81,7 @@ def export_jp2(obj, subdir, name, pagenum):
|
|||||||
return png_path
|
return png_path
|
||||||
|
|
||||||
|
|
||||||
def parse_pdf(fname, subdir, name, debug=0):
|
def parse_pdf(fname, workdir, debug=0):
|
||||||
PDFDocument.debug = debug
|
PDFDocument.debug = debug
|
||||||
PDFParser.debug = debug
|
PDFParser.debug = debug
|
||||||
|
|
||||||
@ -113,7 +112,7 @@ def parse_pdf(fname, subdir, name, debug=0):
|
|||||||
for anno in annots:
|
for anno in annots:
|
||||||
anno = resolve1(anno)
|
anno = resolve1(anno)
|
||||||
if 'AAPL:AKExtras' in anno:
|
if 'AAPL:AKExtras' in anno:
|
||||||
scribbles.append(make_scribble(anno, pagenum, mediabox, subdir, name))
|
scribbles.append(make_scribble(anno, pagenum, mediabox, workdir))
|
||||||
elif 'ProCatName' in anno:
|
elif 'ProCatName' in anno:
|
||||||
prod_boxes.append(make_product_box(anno, pagenum, mediabox))
|
prod_boxes.append(make_product_box(anno, pagenum, mediabox))
|
||||||
else:
|
else:
|
||||||
|
|||||||
@ -15,16 +15,19 @@ def format_season(s):
|
|||||||
return s[:1] + s[2:]
|
return s[:1] + s[2:]
|
||||||
|
|
||||||
|
|
||||||
def write_spreadsheet(matches, subdir, catname, filename):
|
def write_spreadsheet(matches, workdir, file_base):
|
||||||
if not matches:
|
if not matches:
|
||||||
print('write_spreadsheet: no matches. skipping.')
|
print('write_spreadsheet: no matches. skipping.')
|
||||||
return
|
return None
|
||||||
|
|
||||||
wb = Workbook()
|
wb = Workbook()
|
||||||
ws = wb.active
|
ws = wb.active
|
||||||
|
|
||||||
|
# header row
|
||||||
ws.append(['Season', 'Gender', 'Name', 'Style Number', 'Color'])
|
ws.append(['Season', 'Gender', 'Name', 'Style Number', 'Color'])
|
||||||
|
|
||||||
|
# TODO: uniquify and sort matches
|
||||||
|
|
||||||
for m in matches:
|
for m in matches:
|
||||||
# in the case of kids,
|
# in the case of kids,
|
||||||
# we might have multiple products in a match
|
# we might have multiple products in a match
|
||||||
@ -37,14 +40,10 @@ def write_spreadsheet(matches, subdir, catname, filename):
|
|||||||
for s, g, n, m, c in zip_longest(seasons, genders, names, materials, colors, fillvalue=''):
|
for s, g, n, m, c in zip_longest(seasons, genders, names, materials, colors, fillvalue=''):
|
||||||
ws.append([format_season(s), g, n, m, c])
|
ws.append([format_season(s), g, n, m, c])
|
||||||
|
|
||||||
# # Python types will automatically be converted
|
|
||||||
# import datetime
|
|
||||||
# ws['A2'] = datetime.datetime.now()
|
|
||||||
# ws['A2'].style = 'Good'
|
|
||||||
|
|
||||||
# save
|
# save
|
||||||
dir = os.path.join(WORKDIR, subdir, catname)
|
ensure_dir(workdir)
|
||||||
ensure_dir(dir)
|
path = os.path.join(workdir, f"{file_base}.xlsx")
|
||||||
path = os.path.join(dir, f"{filename}.xlsx")
|
|
||||||
wb.save(path)
|
wb.save(path)
|
||||||
set_file_perms(path)
|
set_file_perms(path)
|
||||||
|
|
||||||
|
return path
|
||||||
|
|||||||
@ -19,6 +19,7 @@ from django.conf import settings
|
|||||||
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'procat2.settings')
|
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'procat2.settings')
|
||||||
django.setup()
|
django.setup()
|
||||||
|
|
||||||
|
from markup.utils import WORKDIR, clean_path
|
||||||
from markup.matching import find_marked_products
|
from markup.matching import find_marked_products
|
||||||
from markup.spreadsheet import write_spreadsheet
|
from markup.spreadsheet import write_spreadsheet
|
||||||
|
|
||||||
@ -40,12 +41,11 @@ def main(argv):
|
|||||||
|
|
||||||
fname = args[0]
|
fname = args[0]
|
||||||
path = Path(fname)
|
path = Path(fname)
|
||||||
catname = path.stem
|
workdir = os.path.join(WORKDIR, 'test', clean_path(path.stem))
|
||||||
catname = re.sub(r'[^\w]', '_', catname)
|
|
||||||
|
|
||||||
matches = find_marked_products(fname, subdir, catname, debug=0)
|
matches = find_marked_products(fname, workdir, debug)
|
||||||
print(f'{len(matches)} product matches')
|
print(f'{len(matches)} product matches')
|
||||||
write_spreadsheet(matches, subdir, catname, path.stem)
|
write_spreadsheet(matches, workdir, path.stem)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__': sys.exit(main(sys.argv))
|
if __name__ == '__main__': sys.exit(main(sys.argv))
|
||||||
|
|||||||
Reference in New Issue
Block a user