markup: move functionality to library

2019-10-18 15:14:05 -07:00
parent 5282f7cb2f
commit 94c1a419dc
5 changed files with 104 additions and 24 deletions
--- a/markup/img.py
+++ b/markup/img.py
@ -11,7 +11,7 @@ from pathlib import Path

 from django.conf import settings

-from .utils import cv2_rect
+from .utils import cv2_rect, set_file_perms

 WORKDIR = os.path.join(settings.ASSET_DIR, 'markup', 'work')

@ -19,6 +19,9 @@ WORKDIR = os.path.join(settings.ASSET_DIR, 'markup', 'work')


 def find_shapes(image_path):
+    """Find shapes in the image, returning bounding boxes around each.
+    Writes debug images next to the input image.
+    """
    path = Path(image_path)

    img = Image.open(image_path, 'r')
@ -39,14 +42,12 @@ def find_shapes(image_path):
    # thresh = cv2.threshold(blurred, 60, 255, cv2.THRESH_BINARY)[1]

    thresh_path = str(path.with_suffix('.thresh.png'))
-    # print('write to', thresh_path)
    cv2.imwrite(thresh_path, threshold)
    os.chmod(thresh_path, 0o664)
    shutil.chown(thresh_path, group='procat')

    contours = cv2.findContours(threshold, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    contours = imutils.grab_contours(contours)
-    # print("{} shapes".format(len(contours)))

    bboxes = []
    for c in contours:
@ -58,13 +59,11 @@ def find_shapes(image_path):
        # if M["m00"] == 0: M["m00"] = 0.00001
        # cX = int(M["m10"] / M["m00"])
        # cY = int(M["m01"] / M["m00"])
-        #print('add contour rect: {}'.format(cv2_rect(x, y, w, h)))
        bboxes.append(cv2_rect(x, y, w, h))

    # draw contours
    contour_image = numpy.zeros((threshold.shape[0], threshold.shape[1], 3), dtype=numpy.uint8)
    for i in range(len(contours)):
-        # compute the center of the contour
        color = (rng.randint(0,512), rng.randint(0,512), rng.randint(0,512))
        cv2.drawContours(contour_image, contours, i, color)
        rect = bboxes[i]
@ -74,7 +73,6 @@ def find_shapes(image_path):
        #             cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 1)

    contour_path = str(path.with_suffix('.contour.png'))
-    #print('write to', contour_path)
    cv2.imwrite(contour_path, contour_image)
    os.chmod(contour_path, 0o664)
    shutil.chown(contour_path, group='procat')
@ -82,8 +80,9 @@ def find_shapes(image_path):
    return img.width, img.height, bboxes


-def write_debug_image(cat_name, page_num, prods, scribbles):
-    path = os.path.join(WORKDIR, "debug-{}-{}.png".format(cat_name, page_num))
+def write_debug_image(subdir, cat_name, page_num, prods, scribbles):
+    """Draw an image with boxes for products, images, and shapes."""
+    path = os.path.join(WORKDIR, subdir, f"{cat_name}-debug-page{page_num:03d}.png")

    pagew = int(11*72)
    pageh = int(8.5*72)
@ -109,3 +108,4 @@ def write_debug_image(cat_name, page_num, prods, scribbles):
            draw.rectangle((box.p1(pageh), box.p2(pageh)), outline="hsv(0, 22%, 100%)", width=2)

    img.save(path)
+    set_file_perms(path)
--- a/markup/matching.py
+++ b/markup/matching.py
@ -1,7 +1,25 @@
 from markup.img import find_shapes, write_debug_image
+from markup.pdf import parse_pdf
 from markup.utils import overlaps


+def find_marked_products(pdf, subdir, catname, debug=0):
+    """Main entry point.  Give a pdf, get matches.
+
+    Parses the PDF into product boxes and scribbles, detects shapes in
+    each scribble image, matches scribble shapes against product boxes,
+    and writes one debug image per page that has a scribble.
+
+    Args:
+        pdf: PDF file to parse; passed straight to parse_pdf().
+        subdir: subdirectory name forwarded to parse_pdf() and
+            write_debug_image() for the files they generate.
+        catname: name forwarded to parse_pdf() and write_debug_image(),
+            which use it as the prefix of generated file names.
+        debug: debug level forwarded to parse_pdf().
+
+    Returns:
+        The result of find_matches(), or None when no product placement
+        markers were found.
+    """
+    (prods, scribbles) = parse_pdf(pdf, subdir, catname, debug)
+
+    if not prods:
+        print('no product placement markers found')
+        return None
+
+    # make_scribble() returns None for annotations it cannot export, and
+    # parse_pdf() appends that result as-is; drop those entries so the
+    # subscripting below does not fail on them.
+    scribbles = [s for s in scribbles if s is not None]
+
+    find_scribbles_shapes(scribbles)
+    matches = find_matches(prods, scribbles, 0.10)
+
+    # The debug image path depends only on the page number, so render each
+    # page once (in first-seen order) instead of once per scribble.
+    for page in dict.fromkeys(s['page'] for s in scribbles):
+        write_debug_image(subdir, catname, page, prods, scribbles)
+
+    return matches
+
+
 def find_scribbles_shapes(scribbles):
    for scribble in scribbles:
        imgw, imgh, shapes = find_shapes(scribble['image'])
@ -33,6 +51,7 @@ def find_matches(all_prods, scribbles, overlap_threshold):
    matches = []
    for s in scribbles:
        pagenum = s['page']
+        if not pagenum in page_prods: continue
        prods = page_prods[pagenum]
        for p in prods:
            for box in s['bboxes']:
--- a/markup/pdf.py
+++ b/markup/pdf.py
@ -9,7 +9,7 @@ from pdfminer.pdftypes import PDFObjRef, resolve1

 from django.conf import settings

-from .utils import pdf_rect
+from .utils import pdf_rect, ensure_dir, set_file_perms

 WORKDIR = os.path.join(settings.ASSET_DIR, 'markup', 'work')

@ -34,7 +34,7 @@ def make_product_box(obj, pagenum, mediabox):
        return None


-def make_scribble(obj, pagenum, mediabox):
+def make_scribble(obj, pagenum, mediabox, subdir, name):
    rect = obj['Rect'] # position on page

    # walk the object tree down to the image
@ -51,7 +51,7 @@ def make_scribble(obj, pagenum, mediabox):

    flter = im1['Filter']
    if flter.name == 'JPXDecode':
-        path = export_jp2(im1)
+        path = export_jp2(im1, subdir, name, pagenum)
        return { 'page': pagenum,
                 'rect': pdf_rect(rect, mediabox[3]),
                 'objid': im1.objid,
@ -61,33 +61,29 @@ def make_scribble(obj, pagenum, mediabox):
        return None


-def export_jp2(obj):
-    jp2_path = os.path.join(WORKDIR, "export-{}.jp2".format(obj.objid))
-    png_path = os.path.join(WORKDIR, "export-{}.png".format(obj.objid))
+def export_jp2(obj, subdir, name, pagenum):
+    """Dump a JPEG 2000 image object to disk and convert it to PNG.
+
+    Writes the raw data of `obj` to a .jp2 file under WORKDIR/subdir, then
+    runs the external `opj_decompress` program to produce a .png next to it.
+    Both file names are built from `name`, the zero-padded `pagenum` and the
+    object's `objid`.
+
+    Returns the PNG path.  The path is returned even when the conversion
+    failed (an error is printed in that case), so the file may not exist.
+    """
+    oid = obj.objid
+    jp2_path = os.path.join(WORKDIR, subdir, f"{name}-export-page{pagenum:03d}-{oid}.jp2")
+    png_path = os.path.join(WORKDIR, subdir, f"{name}-export-page{pagenum:03d}-{oid}.png")

-    if not os.path.exists(WORKDIR):
-        os.makedirs(WORKDIR)
-        os.chmod(WORKDIR, 0o775)
-        shutil.chown(WORKDIR, group='procat')
+    # The output directory must exist before either file is written.
+    ensure_dir(os.path.join(WORKDIR, subdir))

    data = obj.get_rawdata()
    print('extracting jp2: {}'.format(jp2_path))
    with open(jp2_path, 'wb') as out:
        out.write(data)
-        os.chmod(jp2_path, 0o664)
-        shutil.chown(jp2_path, group='procat')
+        set_file_perms(jp2_path)

    result = subprocess.run(['opj_decompress', '-i', jp2_path, '-o', png_path], capture_output=True)
    if result.returncode != 0:
        print('ERROR converting {}:\n{}\n{}'.format(jp2_path, result.stdout.decode(), result.stderr.decode()))
    else:
-        os.chmod(png_path, 0o664)
-        shutil.chown(png_path, group='procat')
+        # Only touch the PNG when the converter reported success.
+        set_file_perms(png_path)

    return png_path


-def parse_pdf(fname, debug=0):
+def parse_pdf(fname, subdir, name, debug=0):
    PDFDocument.debug = debug
    PDFParser.debug = debug

@ -118,7 +114,7 @@ def parse_pdf(fname, debug=0):
        for anno in annots:
            anno = resolve1(anno)
            if 'AAPL:AKExtras' in anno:
-                scribbles.append(make_scribble(anno, pagenum, mediabox))
+                scribbles.append(make_scribble(anno, pagenum, mediabox, subdir, name))
            elif 'ProCatName' in anno:
                prod_boxes.append(make_product_box(anno, pagenum, mediabox))
            else:
--- a/markup/utils.py
+++ b/markup/utils.py
@ -1,3 +1,7 @@
+import os
+import shutil
+
+
 def pdf_rect(rect, container_height):
    x1 = min(rect[0], rect[2])
    y1 = max(rect[1], rect[3])
@ -64,3 +68,15 @@ class Rect(object):

    def __repr__(self):
        return 'Rect[l={}, t={}, r={}, b={}]'.format(int(self.left), int(self.top), int(self.right), int(self.bottom))
+
+
+def ensure_dir(dir):
+    """Create directory `dir` (with parents) if needed and set its access.
+
+    The directory is always set to mode 0o775 and group 'procat', whether
+    or not it already existed.
+    """
+    # exist_ok avoids the check-then-create race when two processes ask
+    # for the same directory at the same time.
+    os.makedirs(dir, exist_ok=True)
+    os.chmod(dir, 0o775)
+    shutil.chown(dir, group='procat')
+
+
+def set_file_perms(file):
+    """Set `file` to mode 0o664 and assign it to group 'procat'."""
+    mode = 0o664
+    group = 'procat'
+    os.chmod(file, mode)
+    shutil.chown(file, group=group)
--- a/markup/work/test_all.py
+++ b/markup/work/test_all.py
@ -0,0 +1,49 @@
+#!/usr/bin/env python3
+
+import sys
+import os
+import re
+import inspect
+from pathlib import Path
+
+# This script lives in markup/work/, so two levels up is the directory that
+# contains the `markup` package.  Put it first on sys.path so the
+# `markup.*` import below resolves when the script is run directly.
+currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
+parentdir = os.path.dirname(currentdir)
+parentparentdir = os.path.dirname(parentdir)
+sys.path.insert(0, parentparentdir)
+
+import dumper
+import getopt
+import django
+from django.conf import settings
+
+# Configure Django before importing any markup module: markup.img and
+# markup.pdf read settings.ASSET_DIR at import time.
+os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'procat2.settings')
+django.setup()
+
+from markup.matching import find_marked_products
+
+
+def main(argv):
+    """Run marked-product matching on one PDF from the command line.
+
+    Options:
+        -s subdir  subdirectory name passed to find_marked_products()
+                   (default 'test')
+        -d         increase the debug level; may be repeated
+
+    The catalog name is the PDF's file stem with every non-word character
+    replaced by an underscore.  The matches are printed to stdout.
+
+    Returns 100 after printing usage when the arguments are invalid, and
+    None (exit status 0) otherwise.
+    """
+    def usage():
+        print('usage: %s -s subdir [-d] file.pdf' % argv[0])
+        return 100
+    try:
+        # 's:' declares -s as taking a value; without it getopt rejects
+        # the -s option that the usage text advertises.
+        (opts, args) = getopt.getopt(argv[1:], 'ds:')
+    except getopt.GetoptError:
+        return usage()
+    if not args: return usage()
+    debug = 0
+    subdir = 'test'
+    for (k, v) in opts:
+        if k == '-d': debug += 1
+        elif k == '-s': subdir = v
+
+    fname = args[0]
+    path = Path(fname)
+    catname = path.stem
+    # Replace every non-word character in the stem with an underscore.
+    catname = re.sub(r'[^\w]', '_', catname)
+
+    # Forward the parsed debug level rather than a hard-coded 0.
+    matches = find_marked_products(fname, subdir, catname, debug=debug)
+    print(matches)
+
+
+if __name__ == '__main__': sys.exit(main(sys.argv))