Merge branch 'markup_documents'

2020-02-28 17:16:52 -08:00
parent ed013483b5 b625b4f16d
commit 1e8c71a603
4 changed files with 128 additions and 9 deletions
--- a/markup/email.py
+++ b/markup/email.py
@ -85,6 +85,10 @@ def send_error_email(subj, einfo):
 def send(frm, subj, msg):
    if not EMAIL_HOST:
        log.info(f'not sending email')
        return
    msg['From'] = 'Keen ProCatalog Markup Bot <markup@procatalog.io>'
    msg['Reply-To'] = 'Keen ProCatalog Support <support@procatalog.io>'
    msg['To'] = frm
--- a/markup/img.py
+++ b/markup/img.py
@ -9,6 +9,8 @@ import dumper
 import random as rng
 from pathlib import Path
 from pdfminer.psparser import LIT
 from .utils import cv2_rect, ensure_dir, set_file_perms, WORKDIR
 # https://www.pyimagesearch.com/2014/10/20/finding-shapes-images-using-python-opencv/
@ -110,3 +112,43 @@ def write_debug_image(workdir, page_num, prods, scribbles):
    img.save(path)
    set_file_perms(path)
 def write_inklist(obj, mediabox, path):
    """Render the strokes of an ink annotation to an image file.

    A fully transparent RGBA canvas is created whose size is the width and
    height of ``mediabox``; every entry of ``obj['InkList']`` is drawn on it
    as a black line of width 3. The result is saved to ``path`` and
    ``set_file_perms`` is applied to the written file.

    Args:
        obj: annotation mapping providing an ``'InkList'`` entry.
        mediabox: indexable of four numbers; only the differences
            ``[2] - [0]`` and ``[3] - [1]`` are used (page width/height).
        path: destination passed to ``Image.save``.
    """
    canvas_size = (mediabox[2] - mediabox[0], mediabox[3] - mediabox[1])
    canvas = Image.new('RGBA', canvas_size, (0, 0, 0, 0))
    pen = ImageDraw.Draw(canvas, 'RGBA')

    for stroke in obj['InkList']:
        pen.line(stroke, fill='black', width=3)

    # PDF and image coordinate systems differ in the direction of the
    # vertical axis, so mirror the drawing top-to-bottom before saving.
    flipped = canvas.transpose(Image.FLIP_TOP_BOTTOM)
    flipped.save(path)
    set_file_perms(path)
 def write_square_or_circle(obj, mediabox, path):
    """Draw an image of a Square or Circle annotation's outline.

    A fully transparent RGBA canvas is created whose size is the width and
    height of ``mediabox``. If ``obj['Subtype']`` is the ``Square`` literal
    a rectangle is outlined, otherwise an ellipse; in both cases
    ``obj['Rect']`` is used as the bounding box, with a black outline of
    width 3 and no fill. The result is saved to ``path`` and
    ``set_file_perms`` is applied to the written file.

    Args:
        obj: annotation mapping providing ``'Subtype'`` and ``'Rect'``.
        mediabox: indexable of four numbers; only the differences
            ``[2] - [0]`` and ``[3] - [1]`` are used (page width/height).
        path: destination passed to ``Image.save``.
    """
    pagew = mediabox[2] - mediabox[0]
    pageh = mediabox[3] - mediabox[1]
    img = Image.new('RGBA', (pagew, pageh), (0, 0, 0, 0))
    draw = ImageDraw.Draw(img, 'RGBA')
    if obj["Subtype"] == LIT('Square'):
        draw.rectangle(obj['Rect'], fill=None, outline='black', width=3)
    else:
        # ellipse() takes the bounding box as ONE sequence argument, exactly
        # like rectangle(). Unpacking it with * pushed the coordinates into
        # the fill/outline/width positions and raised TypeError.
        draw.ellipse(obj['Rect'], fill=None, outline='black', width=3)
    # account for the difference in coordinate systems
    # between pdf and images.
    img = img.transpose(Image.FLIP_TOP_BOTTOM)
    img.save(path)
    set_file_perms(path)
--- a/markup/pdf.py
+++ b/markup/pdf.py
@ -9,7 +9,8 @@ from pdfminer.pdfparser import PDFParser
 from pdfminer.pdfdocument import PDFDocument
 from pdfminer.pdftypes import PDFObjRef, resolve1
-from .utils import pdf_rect, ensure_dir, set_file_perms
+from .utils import Rect, pdf_rect, ensure_dir, set_file_perms
 from .img import write_inklist, write_square_or_circle
 def make_product_box(obj, pagenum, mediabox):
@ -39,7 +40,31 @@ def make_product_box(obj, pagenum, mediabox):
        return None
-def make_scribble(obj, pagenum, mediabox, workdir):
+def make_ink_scribble(obj, pagenum, mediabox, workdir):
    """Render an ink annotation to a PNG and return its scribble record.

    The annotation's ``'NM'`` entry is decoded as UTF-8 and used both in the
    PNG file name (``export-page<NNN>-nm<id>.png`` inside ``workdir``) and as
    the record's ``'objid'``. Drawing is delegated to ``write_inklist``.

    Returns a dict with keys ``'page'``, ``'rect'``, ``'objid'`` and
    ``'image'``. Note that ``'rect'`` is built from the page ``mediabox``,
    not from the annotation itself.
    """
    oid = obj['NM'].decode('utf-8')
    png_path = os.path.join(workdir, f"export-page{pagenum:03d}-nm{oid}.png")
    write_inklist(obj, mediabox, png_path)
    # The image written above is sized from the mediabox, so the record
    # reports the mediabox as the scribble's rect.
    return { 'page': pagenum,
             'rect': Rect(*mediabox),
             'objid': oid,
             'image': png_path }
 def make_square_or_circle_scribble(obj, pagenum, mediabox, workdir):
    """Render a Square/Circle annotation to a PNG and describe it.

    The annotation's ``'NM'`` entry (decoded as UTF-8) identifies the
    scribble: it is embedded in the PNG file name created under ``workdir``
    and returned as ``'objid'``. Drawing is delegated to
    ``write_square_or_circle``.

    Returns a dict with keys ``'page'``, ``'rect'``, ``'objid'`` and
    ``'image'``; ``'rect'`` is built from the page ``mediabox``.
    """
    annot_name = obj['NM'].decode('utf-8')
    image_name = f"export-page{pagenum:03d}-nm{annot_name}.png"
    image_path = os.path.join(workdir, image_name)

    write_square_or_circle(obj, mediabox, image_path)

    return {
        'page': pagenum,
        'rect': Rect(*mediabox),
        'objid': annot_name,
        'image': image_path,
    }
 def make_aapl_scribble(obj, pagenum, mediabox, workdir):
    rect = obj['Rect'] # position on page
    # walk the object tree down to the image
@ -143,6 +168,17 @@ def write_pbm(obj, base_path):
    return path
 def is_inklist_annotation(anno):
    """Return whether ``anno`` has a ``'Subtype'`` equal to the ``Ink`` literal."""
    if 'Subtype' not in anno:
        return False
    return anno["Subtype"] == LIT('Ink')
 def is_square_or_circle_annotation(anno):
    """Return True when ``anno``'s ``'Subtype'`` is the ``Square`` or ``Circle`` literal.

    Annotations without a ``'Subtype'`` entry yield False.
    """
    if 'Subtype' not in anno:
        return False
    subtype = anno["Subtype"]
    return bool(subtype == LIT('Square') or subtype == LIT('Circle'))
 def parse_pdf(fname, workdir, debug=0):
    PDFDocument.debug = debug
    PDFParser.debug = debug
@ -173,10 +209,18 @@ def parse_pdf(fname, workdir, debug=0):
        for anno in annots:
            anno = resolve1(anno)
-            if 'AAPL:AKExtras' in anno:
+            if is_inklist_annotation(anno):
-                scribbles.append(make_scribble(anno, pagenum, mediabox, workdir))
+                scribbles.append(make_ink_scribble(anno, pagenum, mediabox, workdir))
            elif is_square_or_circle_annotation(anno):
                scribbles.append(make_square_or_circle_scribble(anno, pagenum, mediabox, workdir))
            elif 'AAPL:AKExtras' in anno:
                scribbles.append(make_aapl_scribble(anno, pagenum, mediabox, workdir))
            elif 'ProCatName' in anno:
                prod_boxes.append(make_product_box(anno, pagenum, mediabox))
            elif anno['Subtype'] == LIT('FreeText'):
                print('ignoring FreeText annotation')
            elif anno['Subtype'] == LIT('Highlight'):
                print('ignoring Highlight annotation')
            else:
                print('ignoring other annotation:')
                print(anno)
--- a/markup/tasks.py
+++ b/markup/tasks.py
@ -2,13 +2,16 @@ from __future__ import absolute_import, unicode_literals
 from celery import task, shared_task
 from celery.utils.log import get_task_logger
 import os
 import re
 import sys
 import datetime
 import fileinput
 import os
 import re
 import shutil
 import smtplib
 import sys
 from pathlib import Path
 from os.path import basename, dirname, isfile
 from email.feedparser import FeedParser
 from email.message import EmailMessage
@ -75,7 +78,6 @@ def process_attachment(from_address, subject, attachment):
    print(f'Using pdf name: {pdf_name}')
    pdf_base = Path(pdf_name).stem
    workdir = os.path.join(WORKDIR, clean_path(from_address), pdf_base)
    ensure_dir(workdir)
    pdf_path = os.path.join(workdir, pdf_name)
@ -84,6 +86,32 @@ def process_attachment(from_address, subject, attachment):
        att.write(attachment.get_payload(decode=True))
    set_file_perms(pdf_path)
    process_pdf(pdf_path, from_address, subject, workdir)
@shared_task(on_failure=on_fail_handler)
def process_markup_pdf(pdf_path, user):
    """Copy a PDF into the user's work directory and process it.

    Does nothing (beyond printing a notice) when ``pdf_path`` is not an
    existing file. Otherwise the work directory is
    ``WORKDIR/<cleaned username>/<cleaned pdf stem>``; the PDF is copied
    there, ``set_file_perms`` is applied to the copy, and ``process_pdf`` is
    called with a "Full Name <email>" sender string and the PDF file name as
    the subject.

    Args:
        pdf_path: path of the PDF to process.
        user: object providing ``username``, ``email`` and
            ``get_full_name()``.
    """
    source = Path(pdf_path)
    if not source.is_file():
        print(f'No pdf - exiting ({pdf_path})')
        return

    pdf_stem = source.stem
    workdir = os.path.join(
        WORKDIR,
        clean_path(user.username),
        clean_path(pdf_stem),
    )
    ensure_dir(workdir)

    pdf_name = source.name
    dest_path = os.path.join(workdir, pdf_name)
    print(f'copying pdf to {dest_path}')
    shutil.copy(pdf_path, dest_path)
    set_file_perms(dest_path)

    # Run both values through the email header decode/encode round trip
    # before handing them to process_pdf.
    sender = f'{user.get_full_name()} <{user.email}>'
    frm = str(make_header(decode_header(sender)))
    subject = str(make_header(decode_header(pdf_name)))

    process_pdf(dest_path, frm, subject, workdir)
 def process_pdf(pdf_path, from_address, subject, workdir):
    # find matches
    matches = find_marked_products(pdf_path, workdir, debug=0)
    if not matches:
@ -94,7 +122,8 @@ def process_attachment(from_address, subject, attachment):
    print(f'{len(matches)} product matches')
    # write spreadsheet
-    xls_path = write_spreadsheet(matches, workdir, pdf_base)
+    pdf_stem = Path(pdf_path).stem
    xls_path = write_spreadsheet(matches, workdir, pdf_stem)
    if xls_path:
        # send reply