diff --git a/markup/img.py b/markup/img.py index 3506a5f..86e4f77 100644 --- a/markup/img.py +++ b/markup/img.py @@ -11,7 +11,7 @@ from pathlib import Path from django.conf import settings -from .utils import cv2_rect +from .utils import cv2_rect, set_file_perms WORKDIR = os.path.join(settings.ASSET_DIR, 'markup', 'work') @@ -19,6 +19,9 @@ WORKDIR = os.path.join(settings.ASSET_DIR, 'markup', 'work') def find_shapes(image_path): + """Find shapes in the image, returning bounding boxes around each. + Writes debug images next to the input image. + """ path = Path(image_path) img = Image.open(image_path, 'r') @@ -39,14 +42,12 @@ def find_shapes(image_path): # thresh = cv2.threshold(blurred, 60, 255, cv2.THRESH_BINARY)[1] thresh_path = str(path.with_suffix('.thresh.png')) - # print('write to', thresh_path) cv2.imwrite(thresh_path, threshold) os.chmod(thresh_path, 0o664) shutil.chown(thresh_path, group='procat') contours = cv2.findContours(threshold, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) contours = imutils.grab_contours(contours) - # print("{} shapes".format(len(contours))) bboxes = [] for c in contours: @@ -58,13 +59,11 @@ def find_shapes(image_path): # if M["m00"] == 0: M["m00"] = 0.00001 # cX = int(M["m10"] / M["m00"]) # cY = int(M["m01"] / M["m00"]) - #print('add contour rect: {}'.format(cv2_rect(x, y, w, h))) bboxes.append(cv2_rect(x, y, w, h)) # draw contours contour_image = numpy.zeros((threshold.shape[0], threshold.shape[1], 3), dtype=numpy.uint8) for i in range(len(contours)): - # compute the center of the contour color = (rng.randint(0,512), rng.randint(0,512), rng.randint(0,512)) cv2.drawContours(contour_image, contours, i, color) rect = bboxes[i] @@ -74,7 +73,6 @@ def find_shapes(image_path): # cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 1) contour_path = str(path.with_suffix('.contour.png')) - #print('write to', contour_path) cv2.imwrite(contour_path, contour_image) os.chmod(contour_path, 0o664) shutil.chown(contour_path, group='procat') @@ -82,8 +80,9 @@ def find_shapes(image_path): return img.width, img.height, bboxes -def write_debug_image(cat_name, page_num, prods, scribbles): - path = os.path.join(WORKDIR, "debug-{}-{}.png".format(cat_name, page_num)) +def write_debug_image(subdir, cat_name, page_num, prods, scribbles): + """Draw an image with boxes for products, images, and shapes.""" + path = os.path.join(WORKDIR, subdir, f"{cat_name}-debug-page{page_num:03d}.png") pagew = int(11*72) pageh = int(8.5*72) @@ -109,3 +108,4 @@ def write_debug_image(cat_name, page_num, prods, scribbles): draw.rectangle((box.p1(pageh), box.p2(pageh)), outline="hsv(0, 22%, 100%)", width=2) img.save(path) + set_file_perms(path) diff --git a/markup/matching.py b/markup/matching.py index 908b02e..2aba05b 100644 --- a/markup/matching.py +++ b/markup/matching.py @@ -1,7 +1,25 @@ from markup.img import find_shapes, write_debug_image +from markup.pdf import parse_pdf from markup.utils import overlaps +def find_marked_products(pdf, subdir, catname, debug=0): + """Main entry point. Give a pdf, get matches.""" + (prods, scribbles) = parse_pdf(pdf, subdir, catname, debug) + + if not prods or len(prods) < 1: + print('no product placement markers found') + return None + + find_scribbles_shapes(scribbles) + matches = find_matches(prods, scribbles, 0.10) + + for s in scribbles: + write_debug_image(subdir, catname, s['page'], prods, scribbles) + + return matches + + def find_scribbles_shapes(scribbles): for scribble in scribbles: imgw, imgh, shapes = find_shapes(scribble['image']) @@ -33,6 +51,7 @@ def find_matches(all_prods, scribbles, overlap_threshold): matches = [] for s in scribbles: pagenum = s['page'] + if not pagenum in page_prods: continue prods = page_prods[pagenum] for p in prods: for box in s['bboxes']: diff --git a/markup/pdf.py b/markup/pdf.py index 1eaf6b9..905093d 100644 --- a/markup/pdf.py +++ b/markup/pdf.py @@ -9,7 +9,7 @@ from pdfminer.pdftypes import PDFObjRef, resolve1 from django.conf import settings -from .utils import pdf_rect +from .utils import pdf_rect, ensure_dir, set_file_perms WORKDIR = os.path.join(settings.ASSET_DIR, 'markup', 'work') @@ -34,7 +34,7 @@ def make_product_box(obj, pagenum, mediabox): return None -def make_scribble(obj, pagenum, mediabox): +def make_scribble(obj, pagenum, mediabox, subdir, name): rect = obj['Rect'] # position on page # walk the object tree down to the image @@ -51,7 +51,7 @@ def make_scribble(obj, pagenum, mediabox): flter = im1['Filter'] if flter.name == 'JPXDecode': - path = export_jp2(im1) + path = export_jp2(im1, subdir, name, pagenum) return { 'page': pagenum, 'rect': pdf_rect(rect, mediabox[3]), 'objid': im1.objid, @@ -61,33 +61,29 @@ def make_scribble(obj, pagenum, mediabox): return None -def export_jp2(obj): - jp2_path = os.path.join(WORKDIR, "export-{}.jp2".format(obj.objid)) - png_path = os.path.join(WORKDIR, "export-{}.png".format(obj.objid)) +def export_jp2(obj, subdir, name, pagenum): + oid = obj.objid + jp2_path = os.path.join(WORKDIR, subdir, f"{name}-export-page{pagenum:03d}-{oid}.jp2") + png_path = os.path.join(WORKDIR, subdir, f"{name}-export-page{pagenum:03d}-{oid}.png") - if not os.path.exists(WORKDIR): - os.makedirs(WORKDIR) - os.chmod(WORKDIR, 0o775) - shutil.chown(WORKDIR, group='procat') + ensure_dir(os.path.join(WORKDIR, subdir)) data = obj.get_rawdata() print('extracting jp2: {}'.format(jp2_path)) with open(jp2_path, 'wb') as out: out.write(data) - os.chmod(jp2_path, 0o664) - shutil.chown(jp2_path, group='procat') + set_file_perms(jp2_path) result = subprocess.run(['opj_decompress', '-i', jp2_path, '-o', png_path], capture_output=True) if result.returncode != 0: print('ERROR converting {}:\n{}\n{}'.format(jp2_path, result.stdout.decode(), result.stderr.decode())) else: - os.chmod(png_path, 0o664) - shutil.chown(png_path, group='procat') + set_file_perms(png_path) return png_path -def parse_pdf(fname, debug=0): +def parse_pdf(fname, subdir, name, debug=0): PDFDocument.debug = debug PDFParser.debug = debug @@ -118,7 +114,7 @@ def parse_pdf(fname, debug=0): for anno in annots: anno = resolve1(anno) if 'AAPL:AKExtras' in anno: - scribbles.append(make_scribble(anno, pagenum, mediabox)) + scribbles.append(make_scribble(anno, pagenum, mediabox, subdir, name)) elif 'ProCatName' in anno: prod_boxes.append(make_product_box(anno, pagenum, mediabox)) else: diff --git a/markup/utils.py b/markup/utils.py index 47ab266..f053d93 100644 --- a/markup/utils.py +++ b/markup/utils.py @@ -1,3 +1,7 @@ +import os +import shutil + + def pdf_rect(rect, container_height): x1 = min(rect[0], rect[2]) y1 = max(rect[1], rect[3]) @@ -64,3 +68,15 @@ class Rect(object): def __repr__(self): return 'Rect[l={}, t={}, r={}, b={}]'.format(int(self.left), int(self.top), int(self.right), int(self.bottom)) + + +def ensure_dir(dir): + if not os.path.exists(dir): + os.makedirs(dir) + os.chmod(dir, 0o775) + shutil.chown(dir, group='procat') + + +def set_file_perms(file): + os.chmod(file, 0o664) + shutil.chown(file, group='procat') diff --git a/markup/work/test_all.py b/markup/work/test_all.py new file mode 100755 index 0000000..4ed4578 --- /dev/null +++ b/markup/work/test_all.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python3 + +import sys +import os +import re +import inspect +from pathlib import Path + +currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) +parentdir = os.path.dirname(currentdir) +parentparentdir = os.path.dirname(parentdir) +sys.path.insert(0, parentparentdir) + +import dumper +import getopt +import django +from django.conf import settings + +os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'procat2.settings') +django.setup() + +from markup.matching import find_marked_products + + +def main(argv): + def usage(): + print('usage: %s -s subdir [-d] file.pdf' % argv[0]) + return 100 + try: + (opts, args) = getopt.getopt(argv[1:], 'd') + except getopt.GetoptError: + return usage() + if not args: return usage() + debug = 0 + subdir = 'test' + for (k, v) in opts: + if k == '-d': debug += 1 + elif k == '-s': subdir = v + + fname = args[0] + path = Path(fname) + catname = path.stem + catname = re.sub(r'[^\w]', '_', catname) + + matches = find_marked_products(fname, subdir, catname, debug=0) + print(matches) + + +if __name__ == '__main__': sys.exit(main(sys.argv))