markup: move functionality to library
This commit is contained in:
@ -11,7 +11,7 @@ from pathlib import Path
|
|||||||
|
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
|
|
||||||
from .utils import cv2_rect
|
from .utils import cv2_rect, set_file_perms
|
||||||
|
|
||||||
WORKDIR = os.path.join(settings.ASSET_DIR, 'markup', 'work')
|
WORKDIR = os.path.join(settings.ASSET_DIR, 'markup', 'work')
|
||||||
|
|
||||||
@ -19,6 +19,9 @@ WORKDIR = os.path.join(settings.ASSET_DIR, 'markup', 'work')
|
|||||||
|
|
||||||
|
|
||||||
def find_shapes(image_path):
|
def find_shapes(image_path):
|
||||||
|
"""Find shapes in the image, returning bounding boxes around each.
|
||||||
|
Writes debug images next to the input image.
|
||||||
|
"""
|
||||||
path = Path(image_path)
|
path = Path(image_path)
|
||||||
|
|
||||||
img = Image.open(image_path, 'r')
|
img = Image.open(image_path, 'r')
|
||||||
@ -39,14 +42,12 @@ def find_shapes(image_path):
|
|||||||
# thresh = cv2.threshold(blurred, 60, 255, cv2.THRESH_BINARY)[1]
|
# thresh = cv2.threshold(blurred, 60, 255, cv2.THRESH_BINARY)[1]
|
||||||
|
|
||||||
thresh_path = str(path.with_suffix('.thresh.png'))
|
thresh_path = str(path.with_suffix('.thresh.png'))
|
||||||
# print('write to', thresh_path)
|
|
||||||
cv2.imwrite(thresh_path, threshold)
|
cv2.imwrite(thresh_path, threshold)
|
||||||
os.chmod(thresh_path, 0o664)
|
os.chmod(thresh_path, 0o664)
|
||||||
shutil.chown(thresh_path, group='procat')
|
shutil.chown(thresh_path, group='procat')
|
||||||
|
|
||||||
contours = cv2.findContours(threshold, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
contours = cv2.findContours(threshold, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
||||||
contours = imutils.grab_contours(contours)
|
contours = imutils.grab_contours(contours)
|
||||||
# print("{} shapes".format(len(contours)))
|
|
||||||
|
|
||||||
bboxes = []
|
bboxes = []
|
||||||
for c in contours:
|
for c in contours:
|
||||||
@ -58,13 +59,11 @@ def find_shapes(image_path):
|
|||||||
# if M["m00"] == 0: M["m00"] = 0.00001
|
# if M["m00"] == 0: M["m00"] = 0.00001
|
||||||
# cX = int(M["m10"] / M["m00"])
|
# cX = int(M["m10"] / M["m00"])
|
||||||
# cY = int(M["m01"] / M["m00"])
|
# cY = int(M["m01"] / M["m00"])
|
||||||
#print('add contour rect: {}'.format(cv2_rect(x, y, w, h)))
|
|
||||||
bboxes.append(cv2_rect(x, y, w, h))
|
bboxes.append(cv2_rect(x, y, w, h))
|
||||||
|
|
||||||
# draw contours
|
# draw contours
|
||||||
contour_image = numpy.zeros((threshold.shape[0], threshold.shape[1], 3), dtype=numpy.uint8)
|
contour_image = numpy.zeros((threshold.shape[0], threshold.shape[1], 3), dtype=numpy.uint8)
|
||||||
for i in range(len(contours)):
|
for i in range(len(contours)):
|
||||||
# compute the center of the contour
|
|
||||||
color = (rng.randint(0,512), rng.randint(0,512), rng.randint(0,512))
|
color = (rng.randint(0,512), rng.randint(0,512), rng.randint(0,512))
|
||||||
cv2.drawContours(contour_image, contours, i, color)
|
cv2.drawContours(contour_image, contours, i, color)
|
||||||
rect = bboxes[i]
|
rect = bboxes[i]
|
||||||
@ -74,7 +73,6 @@ def find_shapes(image_path):
|
|||||||
# cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 1)
|
# cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 1)
|
||||||
|
|
||||||
contour_path = str(path.with_suffix('.contour.png'))
|
contour_path = str(path.with_suffix('.contour.png'))
|
||||||
#print('write to', contour_path)
|
|
||||||
cv2.imwrite(contour_path, contour_image)
|
cv2.imwrite(contour_path, contour_image)
|
||||||
os.chmod(contour_path, 0o664)
|
os.chmod(contour_path, 0o664)
|
||||||
shutil.chown(contour_path, group='procat')
|
shutil.chown(contour_path, group='procat')
|
||||||
@ -82,8 +80,9 @@ def find_shapes(image_path):
|
|||||||
return img.width, img.height, bboxes
|
return img.width, img.height, bboxes
|
||||||
|
|
||||||
|
|
||||||
def write_debug_image(cat_name, page_num, prods, scribbles):
|
def write_debug_image(subdir, cat_name, page_num, prods, scribbles):
|
||||||
path = os.path.join(WORKDIR, "debug-{}-{}.png".format(cat_name, page_num))
|
"""Draw an image with boxes for products, images, and shapes."""
|
||||||
|
path = os.path.join(WORKDIR, subdir, f"{cat_name}-debug-page{page_num:03d}.png")
|
||||||
|
|
||||||
pagew = int(11*72)
|
pagew = int(11*72)
|
||||||
pageh = int(8.5*72)
|
pageh = int(8.5*72)
|
||||||
@ -109,3 +108,4 @@ def write_debug_image(cat_name, page_num, prods, scribbles):
|
|||||||
draw.rectangle((box.p1(pageh), box.p2(pageh)), outline="hsv(0, 22%, 100%)", width=2)
|
draw.rectangle((box.p1(pageh), box.p2(pageh)), outline="hsv(0, 22%, 100%)", width=2)
|
||||||
|
|
||||||
img.save(path)
|
img.save(path)
|
||||||
|
set_file_perms(path)
|
||||||
|
|||||||
@ -1,7 +1,25 @@
|
|||||||
from markup.img import find_shapes, write_debug_image
|
from markup.img import find_shapes, write_debug_image
|
||||||
|
from markup.pdf import parse_pdf
|
||||||
from markup.utils import overlaps
|
from markup.utils import overlaps
|
||||||
|
|
||||||
|
|
||||||
|
def find_marked_products(pdf, subdir, catname, debug=0):
    """Main entry point. Give a pdf, get matches.

    Parses the PDF into product boxes and scribbles, detects the shapes
    inside every scribble image, matches them against the product boxes
    and writes one debug image per page that carries a scribble.

    Args:
        pdf: the PDF to analyse; passed straight to ``parse_pdf``.
        subdir: sub-directory name forwarded to ``parse_pdf`` and
            ``write_debug_image`` for their output files.
        catname: catalog name used as the prefix of generated file names.
        debug: debug level forwarded to ``parse_pdf``.

    Returns:
        Whatever ``find_matches`` returns, or ``None`` when the PDF has
        no product placement markers.
    """
    prods, scribbles = parse_pdf(pdf, subdir, catname, debug)

    if not prods:
        print('no product placement markers found')
        return None

    find_scribbles_shapes(scribbles)
    # 0.10 is the overlap_threshold handed to find_matches.
    matches = find_matches(prods, scribbles, 0.10)

    # The debug image is keyed by page, so render every page exactly once
    # (first-seen order) even when several scribbles share a page.
    for page in dict.fromkeys(s['page'] for s in scribbles):
        write_debug_image(subdir, catname, page, prods, scribbles)

    return matches
|
||||||
|
|
||||||
|
|
||||||
def find_scribbles_shapes(scribbles):
|
def find_scribbles_shapes(scribbles):
|
||||||
for scribble in scribbles:
|
for scribble in scribbles:
|
||||||
imgw, imgh, shapes = find_shapes(scribble['image'])
|
imgw, imgh, shapes = find_shapes(scribble['image'])
|
||||||
@ -33,6 +51,7 @@ def find_matches(all_prods, scribbles, overlap_threshold):
|
|||||||
matches = []
|
matches = []
|
||||||
for s in scribbles:
|
for s in scribbles:
|
||||||
pagenum = s['page']
|
pagenum = s['page']
|
||||||
|
if not pagenum in page_prods: continue
|
||||||
prods = page_prods[pagenum]
|
prods = page_prods[pagenum]
|
||||||
for p in prods:
|
for p in prods:
|
||||||
for box in s['bboxes']:
|
for box in s['bboxes']:
|
||||||
|
|||||||
@ -9,7 +9,7 @@ from pdfminer.pdftypes import PDFObjRef, resolve1
|
|||||||
|
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
|
|
||||||
from .utils import pdf_rect
|
from .utils import pdf_rect, ensure_dir, set_file_perms
|
||||||
|
|
||||||
WORKDIR = os.path.join(settings.ASSET_DIR, 'markup', 'work')
|
WORKDIR = os.path.join(settings.ASSET_DIR, 'markup', 'work')
|
||||||
|
|
||||||
@ -34,7 +34,7 @@ def make_product_box(obj, pagenum, mediabox):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def make_scribble(obj, pagenum, mediabox):
|
def make_scribble(obj, pagenum, mediabox, subdir, name):
|
||||||
rect = obj['Rect'] # position on page
|
rect = obj['Rect'] # position on page
|
||||||
|
|
||||||
# walk the object tree down to the image
|
# walk the object tree down to the image
|
||||||
@ -51,7 +51,7 @@ def make_scribble(obj, pagenum, mediabox):
|
|||||||
|
|
||||||
flter = im1['Filter']
|
flter = im1['Filter']
|
||||||
if flter.name == 'JPXDecode':
|
if flter.name == 'JPXDecode':
|
||||||
path = export_jp2(im1)
|
path = export_jp2(im1, subdir, name, pagenum)
|
||||||
return { 'page': pagenum,
|
return { 'page': pagenum,
|
||||||
'rect': pdf_rect(rect, mediabox[3]),
|
'rect': pdf_rect(rect, mediabox[3]),
|
||||||
'objid': im1.objid,
|
'objid': im1.objid,
|
||||||
@ -61,33 +61,29 @@ def make_scribble(obj, pagenum, mediabox):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def export_jp2(obj, subdir, name, pagenum):
    """Dump a JPEG 2000 stream to disk and convert it to PNG.

    The raw data of ``obj`` is written to
    ``WORKDIR/subdir/<name>-export-page<NNN>-<objid>.jp2`` and then
    converted with the external ``opj_decompress`` tool to a ``.png``
    with the same stem. Conversion errors are printed, not raised.

    Returns:
        The PNG path. It is returned even when the conversion failed.
    """
    oid = obj.objid
    out_dir = os.path.join(WORKDIR, subdir)
    jp2_path = os.path.join(out_dir, f"{name}-export-page{pagenum:03d}-{oid}.jp2")
    png_path = os.path.join(out_dir, f"{name}-export-page{pagenum:03d}-{oid}.png")

    ensure_dir(out_dir)

    raw = obj.get_rawdata()
    print('extracting jp2: {}'.format(jp2_path))
    with open(jp2_path, 'wb') as jp2_file:
        jp2_file.write(raw)
    set_file_perms(jp2_path)

    command = ['opj_decompress', '-i', jp2_path, '-o', png_path]
    proc = subprocess.run(command, capture_output=True)
    if proc.returncode == 0:
        # Only touch the PNG when the converter reported success.
        set_file_perms(png_path)
    else:
        print('ERROR converting {}:\n{}\n{}'.format(jp2_path, proc.stdout.decode(), proc.stderr.decode()))

    return png_path
|
||||||
|
|
||||||
|
|
||||||
def parse_pdf(fname, debug=0):
|
def parse_pdf(fname, subdir, name, debug=0):
|
||||||
PDFDocument.debug = debug
|
PDFDocument.debug = debug
|
||||||
PDFParser.debug = debug
|
PDFParser.debug = debug
|
||||||
|
|
||||||
@ -118,7 +114,7 @@ def parse_pdf(fname, debug=0):
|
|||||||
for anno in annots:
|
for anno in annots:
|
||||||
anno = resolve1(anno)
|
anno = resolve1(anno)
|
||||||
if 'AAPL:AKExtras' in anno:
|
if 'AAPL:AKExtras' in anno:
|
||||||
scribbles.append(make_scribble(anno, pagenum, mediabox))
|
scribbles.append(make_scribble(anno, pagenum, mediabox, subdir, name))
|
||||||
elif 'ProCatName' in anno:
|
elif 'ProCatName' in anno:
|
||||||
prod_boxes.append(make_product_box(anno, pagenum, mediabox))
|
prod_boxes.append(make_product_box(anno, pagenum, mediabox))
|
||||||
else:
|
else:
|
||||||
|
|||||||
@ -1,3 +1,7 @@
|
|||||||
|
import os
|
||||||
|
import shutil
|
||||||
|
|
||||||
|
|
||||||
def pdf_rect(rect, container_height):
|
def pdf_rect(rect, container_height):
|
||||||
x1 = min(rect[0], rect[2])
|
x1 = min(rect[0], rect[2])
|
||||||
y1 = max(rect[1], rect[3])
|
y1 = max(rect[1], rect[3])
|
||||||
@ -64,3 +68,15 @@ class Rect(object):
|
|||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return 'Rect[l={}, t={}, r={}, b={}]'.format(int(self.left), int(self.top), int(self.right), int(self.bottom))
|
return 'Rect[l={}, t={}, r={}, b={}]'.format(int(self.left), int(self.top), int(self.right), int(self.bottom))
|
||||||
|
|
||||||
|
|
||||||
|
def ensure_dir(dir):
    """Create directory ``dir`` (with parents) if it does not exist yet.

    A newly created directory gets mode 0o775 and group ``procat``.
    A path that already exists is left untouched.

    Args:
        dir: path of the directory to create. The name shadows the
            builtin, but is kept so keyword callers keep working.
    """
    # Try to create and handle "already there" instead of checking first:
    # a check-then-create sequence can crash when two processes race to
    # create the same directory.
    try:
        os.makedirs(dir)
    except FileExistsError:
        return
    os.chmod(dir, 0o775)
    shutil.chown(dir, group='procat')
|
||||||
|
|
||||||
|
|
||||||
|
def set_file_perms(file):
    """Give ``file`` mode 0o664 and assign it to group ``procat``."""
    mode = 0o664
    group_name = 'procat'
    os.chmod(file, mode)
    shutil.chown(file, group=group_name)
|
||||||
|
|||||||
49
markup/work/test_all.py
Executable file
49
markup/work/test_all.py
Executable file
@ -0,0 +1,49 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import inspect
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
|
||||||
|
parentdir = os.path.dirname(currentdir)
|
||||||
|
parentparentdir = os.path.dirname(parentdir)
|
||||||
|
sys.path.insert(0, parentparentdir)
|
||||||
|
|
||||||
|
import dumper
|
||||||
|
import getopt
|
||||||
|
import django
|
||||||
|
from django.conf import settings
|
||||||
|
|
||||||
|
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'procat2.settings')
|
||||||
|
django.setup()
|
||||||
|
|
||||||
|
from markup.matching import find_marked_products
|
||||||
|
|
||||||
|
|
||||||
|
def main(argv):
    """Run the markup matcher on one PDF and print the matches.

    Command line: ``-s subdir`` selects the work sub-directory
    (default ``'test'``); each ``-d`` raises the debug level by one.

    Args:
        argv: full argument vector, program name first.

    Returns:
        100 when the arguments are invalid or no PDF is given,
        otherwise ``None``.
    """
    def usage():
        print('usage: %s -s subdir [-d] file.pdf' % argv[0])
        return 100

    try:
        # 's:' declares -s as an option taking a value; without it every
        # use of the advertised -s flag ended in the usage message.
        (opts, args) = getopt.getopt(argv[1:], 'ds:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    debug = 0
    subdir = 'test'
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-s':
            subdir = v

    fname = args[0]
    # Catalog name: the file stem with every non-word character replaced
    # by an underscore.
    catname = re.sub(r'[^\w]', '_', Path(fname).stem)

    # Forward the parsed debug level instead of a hard-coded 0.
    matches = find_marked_products(fname, subdir, catname, debug=debug)
    print(matches)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__': sys.exit(main(sys.argv))
|
||||||
Reference in New Issue
Block a user