Files
procat2/markup/matching.py

63 lines
1.8 KiB
Python

from .img import find_shapes, write_debug_image
from .pdf import parse_pdf
from .utils import overlaps
def find_marked_products(pdf, workdir, debug=0):
    """Main entry point. Give a pdf, get the products that were marked.

    The pdf is parsed into product placement markers and scribbles, the
    shapes inside every scribble are located, and each product whose
    rectangle overlaps a scribble shape is reported as a match.

    Args:
        pdf: The pdf to analyse; handed to ``parse_pdf`` unchanged.
        workdir: Working directory; handed to ``parse_pdf`` and
            ``write_debug_image`` unchanged.
        debug: Debug setting forwarded to ``parse_pdf``.

    Returns:
        The list produced by ``find_matches``, or ``None`` (after printing
        a message) when ``parse_pdf`` yields no products.
    """
    prods, scribbles = parse_pdf(pdf, workdir, debug)
    # An empty (or None) product collection is already falsy, so a separate
    # length comparison is unnecessary.
    if not prods:
        print('no product placement markers found')
        return None

    # Adds a 'bboxes' entry to every scribble; find_matches reads it.
    find_scribbles_shapes(scribbles)
    matches = find_matches(prods, scribbles, 0.10)

    # NOTE(review): a debug image is requested for every scribble's page
    # regardless of the value of ``debug`` -- confirm this is intended.
    for scribble in scribbles:
        write_debug_image(workdir, scribble['page'], prods, scribbles)
    return matches
def find_scribbles_shapes(scribbles):
    """Attach the pdf-space shapes found in each scribble image.

    For every scribble, ``find_shapes`` is run on ``scribble['image']``
    and each resulting shape is passed through ``transform`` together with
    ``scribble['rect']`` and the image dimensions. The converted shapes
    are stored in place under ``scribble['bboxes']``.

    Args:
        scribbles: Iterable of scribble dicts carrying ``'image'`` and
            ``'rect'`` entries.

    Returns:
        None. The scribble dicts are modified in place.
    """
    for entry in scribbles:
        width, height, found = find_shapes(entry['image'])
        converted = []
        for shape in found:
            converted.append(transform(entry['rect'], width, height, shape))
        entry['bboxes'] = converted
def transform(pdf_rect, imgw, imgh, shape):
    """Convert a scribble shape from image coords to pdf coords.

    The shape is scaled by the ratio between the pdf rectangle's size and
    the image's size, then shifted to the rectangle's top-left corner.

    Args:
        pdf_rect: Object exposing ``left``, ``right``, ``top`` and
            ``bottom`` attributes, in pdf coordinates.
        imgw: Width of the image the shape was found in.
        imgh: Height of the image the shape was found in.
        shape: Object providing ``scale(sx, sy)``, whose result provides
            ``translate(dx, dy)``.

    Returns:
        Whatever ``translate`` returns for the scaled shape.
    """
    span_x = pdf_rect.right - pdf_rect.left
    span_y = pdf_rect.bottom - pdf_rect.top
    # Per-axis factors that take image units to pdf units.
    factor_x, factor_y = span_x / imgw, span_y / imgh
    resized = shape.scale(factor_x, factor_y)
    return resized.translate(pdf_rect.left, pdf_rect.top)
def find_matches(all_prods, scribbles, overlap_threshold):
    """Return the products whose rectangle overlaps a scribble shape.

    Products are only compared with scribbles on the same page. When a
    product overlaps, its ``'matched'`` entry is set to the overlapping
    scribble (the last overlapping scribble wins if there are several).

    Each product appears at most once in the result, even when several
    shapes or several scribbles overlap it.

    Args:
        all_prods: Iterable of product dicts with ``'page'`` and ``'rect'``
            entries.
        scribbles: Iterable of scribble dicts with ``'page'`` and
            ``'bboxes'`` entries.
        overlap_threshold: Forwarded unchanged as the third argument of
            ``overlaps``.

    Returns:
        List of matched product dicts, in order of first match.
    """
    # Segment the products by page.
    prods_by_page = {}
    for prod in all_prods:
        prods_by_page.setdefault(prod['page'], []).append(prod)

    matches = []
    # Product dicts are unhashable, so track what is already reported by
    # object identity.
    reported = set()
    for scribble in scribbles:
        for prod in prods_by_page.get(scribble['page'], ()):
            # any() stops at the first overlapping shape, so one scribble
            # made of several shapes counts once per product.
            hit = any(
                overlaps(prod['rect'], box, overlap_threshold)
                for box in scribble['bboxes']
            )
            if not hit:
                continue
            prod['matched'] = scribble
            if id(prod) not in reported:
                reported.add(id(prod))
                matches.append(prod)
    return matches