Merge branch 'markup_documents'

This commit is contained in:
2020-02-28 17:16:52 -08:00
4 changed files with 128 additions and 9 deletions

View File

@ -85,6 +85,10 @@ def send_error_email(subj, einfo):
def send(frm, subj, msg): def send(frm, subj, msg):
if not EMAIL_HOST:
log.info(f'not sending email')
return
msg['From'] = 'Keen ProCatalog Markup Bot <markup@procatalog.io>' msg['From'] = 'Keen ProCatalog Markup Bot <markup@procatalog.io>'
msg['Reply-To'] = 'Keen ProCatalog Support <support@procatalog.io>' msg['Reply-To'] = 'Keen ProCatalog Support <support@procatalog.io>'
msg['To'] = frm msg['To'] = frm

View File

@ -9,6 +9,8 @@ import dumper
import random as rng import random as rng
from pathlib import Path from pathlib import Path
from pdfminer.psparser import LIT
from .utils import cv2_rect, ensure_dir, set_file_perms, WORKDIR from .utils import cv2_rect, ensure_dir, set_file_perms, WORKDIR
# https://www.pyimagesearch.com/2014/10/20/finding-shapes-images-using-python-opencv/ # https://www.pyimagesearch.com/2014/10/20/finding-shapes-images-using-python-opencv/
@ -110,3 +112,43 @@ def write_debug_image(workdir, page_num, prods, scribbles):
img.save(path) img.save(path)
set_file_perms(path) set_file_perms(path)
def write_inklist(obj, mediabox, path):
    """Render the ink strokes of an annotation to an image file.

    A fully transparent RGBA canvas sized from the media box is created,
    each entry of ``obj['InkList']`` is drawn on it as a black line of
    width 3, and the canvas is flipped top-to-bottom before it is saved to
    *path*. File permissions are then applied with ``set_file_perms``.
    """
    canvas_size = (mediabox[2] - mediabox[0], mediabox[3] - mediabox[1])
    canvas = Image.new('RGBA', canvas_size, (0, 0, 0, 0))
    pen = ImageDraw.Draw(canvas, 'RGBA')
    for stroke in obj['InkList']:
        pen.line(stroke, fill='black', width=3)
    # PDF and image coordinate systems differ, so mirror the canvas
    # vertically before writing it out.
    flipped = canvas.transpose(Image.FLIP_TOP_BOTTOM)
    flipped.save(path)
    set_file_perms(path)
def write_square_or_circle(obj, mediabox, path):
    """Draw the outline of a Square or Circle annotation to an image file.

    A fully transparent RGBA canvas sized from the media box is created and
    the annotation's ``Rect`` is outlined in black with width 3: as a
    rectangle when ``obj['Subtype']`` is the ``Square`` literal, otherwise
    as the ellipse inscribed in that rectangle. The canvas is flipped
    top-to-bottom before it is saved to *path*, and file permissions are
    then applied with ``set_file_perms``.
    """
    pagew = mediabox[2] - mediabox[0]
    pageh = mediabox[3] - mediabox[1]
    img = Image.new('RGBA', (pagew, pageh), (0, 0, 0, 0))
    draw = ImageDraw.Draw(img, 'RGBA')
    if obj["Subtype"] == LIT('Square'):
        draw.rectangle(obj['Rect'], fill=None, outline='black', width=3)
    else:
        # ellipse() takes its bounding box as ONE sequence, exactly like
        # rectangle(). Unpacking it with * pushed the second coordinate
        # into the `fill` slot and clashed with the fill= keyword.
        draw.ellipse(obj['Rect'], fill=None, outline='black', width=3)
    # account for the difference in coordinate systems
    # between pdf and images.
    img = img.transpose(Image.FLIP_TOP_BOTTOM)
    img.save(path)
    set_file_perms(path)

View File

@ -9,7 +9,8 @@ from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdftypes import PDFObjRef, resolve1 from pdfminer.pdftypes import PDFObjRef, resolve1
from .utils import pdf_rect, ensure_dir, set_file_perms from .utils import Rect, pdf_rect, ensure_dir, set_file_perms
from .img import write_inklist, write_square_or_circle
def make_product_box(obj, pagenum, mediabox): def make_product_box(obj, pagenum, mediabox):
@ -39,7 +40,31 @@ def make_product_box(obj, pagenum, mediabox):
return None return None
def make_ink_scribble(obj, pagenum, mediabox, workdir):
    """Export an Ink annotation as a PNG and return its scribble record.

    The image is written by ``write_inklist`` into *workdir*, under a name
    built from the zero-padded page number and the annotation's ``NM``
    value. The returned dict carries the page number, a ``Rect`` built
    from the media box, the decoded ``NM`` value and the image path.
    """
    # NOTE(review): the NM value is placed in the file name as-is;
    # confirm it can never contain path separators.
    annotation_name = obj['NM'].decode('utf-8')
    image_file = os.path.join(
        workdir, f"export-page{pagenum:03d}-nm{annotation_name}.png")
    write_inklist(obj, mediabox, image_file)
    return dict(
        page=pagenum,
        rect=Rect(*mediabox),
        objid=annotation_name,
        image=image_file,
    )
def make_square_or_circle_scribble(obj, pagenum, mediabox, workdir):
    """Export a Square/Circle annotation as a PNG and return its record.

    The image is written by ``write_square_or_circle`` into *workdir*,
    under a name built from the zero-padded page number and the
    annotation's ``NM`` value. The returned dict carries the page number,
    a ``Rect`` built from the media box, the decoded ``NM`` value and the
    image path.
    """
    # NOTE(review): the NM value is placed in the file name as-is;
    # confirm it can never contain path separators.
    annotation_name = obj['NM'].decode('utf-8')
    image_file = os.path.join(
        workdir, f"export-page{pagenum:03d}-nm{annotation_name}.png")
    write_square_or_circle(obj, mediabox, image_file)
    return dict(
        page=pagenum,
        rect=Rect(*mediabox),
        objid=annotation_name,
        image=image_file,
    )
def make_aapl_scribble(obj, pagenum, mediabox, workdir):
rect = obj['Rect'] # position on page rect = obj['Rect'] # position on page
# walk the object tree down to the image # walk the object tree down to the image
@ -143,6 +168,17 @@ def write_pbm(obj, base_path):
return path return path
def is_inklist_annotation(anno):
    """Return True when *anno* has a ``Subtype`` equal to the Ink literal."""
    if 'Subtype' not in anno:
        return False
    return anno["Subtype"] == LIT('Ink')
def is_square_or_circle_annotation(anno):
    """Return True when *anno* has a ``Subtype`` of Square or Circle.

    Annotations without a ``Subtype`` entry are reported as False.
    """
    if 'Subtype' not in anno:
        return False
    # Compare against each shape literal in turn, stopping at the first hit.
    return any(anno["Subtype"] == LIT(shape) for shape in ('Square', 'Circle'))
def parse_pdf(fname, workdir, debug=0): def parse_pdf(fname, workdir, debug=0):
PDFDocument.debug = debug PDFDocument.debug = debug
PDFParser.debug = debug PDFParser.debug = debug
@ -173,10 +209,18 @@ def parse_pdf(fname, workdir, debug=0):
for anno in annots: for anno in annots:
anno = resolve1(anno) anno = resolve1(anno)
if 'AAPL:AKExtras' in anno: if is_inklist_annotation(anno):
scribbles.append(make_scribble(anno, pagenum, mediabox, workdir)) scribbles.append(make_ink_scribble(anno, pagenum, mediabox, workdir))
elif is_square_or_circle_annotation(anno):
scribbles.append(make_square_or_circle_scribble(anno, pagenum, mediabox, workdir))
elif 'AAPL:AKExtras' in anno:
scribbles.append(make_aapl_scribble(anno, pagenum, mediabox, workdir))
elif 'ProCatName' in anno: elif 'ProCatName' in anno:
prod_boxes.append(make_product_box(anno, pagenum, mediabox)) prod_boxes.append(make_product_box(anno, pagenum, mediabox))
elif anno['Subtype'] == LIT('FreeText'):
print('ignoring FreeText annotation')
elif anno['Subtype'] == LIT('Highlight'):
print('ignoring Highlight annotation')
else: else:
print('ignoring other annotation:') print('ignoring other annotation:')
print(anno) print(anno)

View File

@ -2,13 +2,16 @@ from __future__ import absolute_import, unicode_literals
from celery import task, shared_task from celery import task, shared_task
from celery.utils.log import get_task_logger from celery.utils.log import get_task_logger
import os
import re
import sys
import datetime import datetime
import fileinput import fileinput
import os
import re
import shutil
import smtplib import smtplib
import sys
from pathlib import Path from pathlib import Path
from os.path import basename, dirname, isfile
from email.feedparser import FeedParser from email.feedparser import FeedParser
from email.message import EmailMessage from email.message import EmailMessage
@ -75,7 +78,6 @@ def process_attachment(from_address, subject, attachment):
print(f'Using pdf name: {pdf_name}') print(f'Using pdf name: {pdf_name}')
pdf_base = Path(pdf_name).stem pdf_base = Path(pdf_name).stem
workdir = os.path.join(WORKDIR, clean_path(from_address), pdf_base) workdir = os.path.join(WORKDIR, clean_path(from_address), pdf_base)
ensure_dir(workdir) ensure_dir(workdir)
pdf_path = os.path.join(workdir, pdf_name) pdf_path = os.path.join(workdir, pdf_name)
@ -84,6 +86,32 @@ def process_attachment(from_address, subject, attachment):
att.write(attachment.get_payload(decode=True)) att.write(attachment.get_payload(decode=True))
set_file_perms(pdf_path) set_file_perms(pdf_path)
process_pdf(pdf_path, from_address, subject, workdir)
@shared_task(on_failure=on_fail_handler)
def process_markup_pdf(pdf_path, user):
    """Copy a PDF into the user's work directory and process it.

    Does nothing beyond printing a message when *pdf_path* is not an
    existing file. Otherwise the PDF is copied into a work directory under
    ``WORKDIR`` named after the cleaned username and the cleaned file stem,
    permissions are applied to the copy, and ``process_pdf`` is called with
    a sender header built from the user's full name and email and a
    subject built from the file name.
    """
    source = Path(pdf_path)
    if not source.is_file():
        print(f'No pdf - exiting ({pdf_path})')
        return

    # Per-user, per-document working directory.
    user_dir = clean_path(user.username)
    doc_dir = clean_path(source.stem)
    workdir = os.path.join(WORKDIR, user_dir, doc_dir)
    ensure_dir(workdir)

    # Work on a private copy so the original file is left untouched.
    pdf_name = source.name
    dest_path = os.path.join(workdir, pdf_name)
    print(f'copying pdf to {dest_path}')
    shutil.copy(pdf_path, dest_path)
    set_file_perms(dest_path)

    sender = f'{user.get_full_name()} <{user.email}>'
    frm = str(make_header(decode_header(sender)))
    subject = str(make_header(decode_header(pdf_name)))
    process_pdf(dest_path, frm, subject, workdir)
def process_pdf(pdf_path, from_address, subject, workdir):
# find matches # find matches
matches = find_marked_products(pdf_path, workdir, debug=0) matches = find_marked_products(pdf_path, workdir, debug=0)
if not matches: if not matches:
@ -94,7 +122,8 @@ def process_attachment(from_address, subject, attachment):
print(f'{len(matches)} product matches') print(f'{len(matches)} product matches')
# write spreadsheet # write spreadsheet
xls_path = write_spreadsheet(matches, workdir, pdf_base) pdf_stem = Path(pdf_path).stem
xls_path = write_spreadsheet(matches, workdir, pdf_stem)
if xls_path: if xls_path:
# send reply # send reply