Merge branch 'markup_documents'
This commit is contained in:
@ -85,6 +85,10 @@ def send_error_email(subj, einfo):
|
|||||||
|
|
||||||
|
|
||||||
def send(frm, subj, msg):
|
def send(frm, subj, msg):
|
||||||
|
if not EMAIL_HOST:
|
||||||
|
log.info(f'not sending email')
|
||||||
|
return
|
||||||
|
|
||||||
msg['From'] = 'Keen ProCatalog Markup Bot <markup@procatalog.io>'
|
msg['From'] = 'Keen ProCatalog Markup Bot <markup@procatalog.io>'
|
||||||
msg['Reply-To'] = 'Keen ProCatalog Support <support@procatalog.io>'
|
msg['Reply-To'] = 'Keen ProCatalog Support <support@procatalog.io>'
|
||||||
msg['To'] = frm
|
msg['To'] = frm
|
||||||
|
|||||||
@ -9,6 +9,8 @@ import dumper
|
|||||||
import random as rng
|
import random as rng
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
from pdfminer.psparser import LIT
|
||||||
|
|
||||||
from .utils import cv2_rect, ensure_dir, set_file_perms, WORKDIR
|
from .utils import cv2_rect, ensure_dir, set_file_perms, WORKDIR
|
||||||
|
|
||||||
# https://www.pyimagesearch.com/2014/10/20/finding-shapes-images-using-python-opencv/
|
# https://www.pyimagesearch.com/2014/10/20/finding-shapes-images-using-python-opencv/
|
||||||
@ -110,3 +112,43 @@ def write_debug_image(workdir, page_num, prods, scribbles):
|
|||||||
|
|
||||||
img.save(path)
|
img.save(path)
|
||||||
set_file_perms(path)
|
set_file_perms(path)
|
||||||
|
|
||||||
|
|
||||||
|
def write_inklist(obj, mediabox, path):
    """Render the strokes of an ink annotation to an image file.

    Args:
        obj: Annotation dictionary. Each element of its ``'InkList'`` entry
            is handed to ``ImageDraw.line`` as one polyline.
        mediabox: Page box as ``(x0, y0, x1, y1)``; only its width and
            height are used, to size the canvas.
        path: Destination the image is saved to.
    """
    # Pillow only accepts integer image sizes, while a PDF page box may hold
    # non-integer values, so round to whole pixels before creating the canvas.
    pagew = int(round(mediabox[2] - mediabox[0]))
    pageh = int(round(mediabox[3] - mediabox[1]))

    # fully transparent canvas the size of the page
    img = Image.new('RGBA', (pagew, pageh), (0, 0, 0, 0))
    draw = ImageDraw.Draw(img, 'RGBA')

    # NOTE(review): stroke coordinates are drawn as-is, which assumes the
    # mediabox origin is (0, 0) -- confirm for pages with an offset box.
    for segment in obj['InkList']:
        draw.line(segment, 'black', 3)

    # account for the difference in coordinate systems
    # between pdf and images.
    img = img.transpose(Image.FLIP_TOP_BOTTOM)

    img.save(path)
    set_file_perms(path)
|
||||||
|
|
||||||
|
|
||||||
|
def write_square_or_circle(obj, mediabox, path):
    """Render a square or circle annotation to an image file.

    Args:
        obj: Annotation dictionary. ``'Subtype'`` selects the shape
            (``LIT('Square')`` draws a rectangle, anything else an ellipse)
            and ``'Rect'`` gives the four bounding coordinates.
        mediabox: Page box as ``(x0, y0, x1, y1)``; only its width and
            height are used, to size the canvas.
        path: Destination the image is saved to.
    """
    # Pillow only accepts integer image sizes, while a PDF page box may hold
    # non-integer values, so round to whole pixels before creating the canvas.
    pagew = int(round(mediabox[2] - mediabox[0]))
    pageh = int(round(mediabox[3] - mediabox[1]))

    # fully transparent canvas the size of the page
    img = Image.new('RGBA', (pagew, pageh), (0, 0, 0, 0))
    draw = ImageDraw.Draw(img, 'RGBA')

    # The two corners may be listed in either order; the drawing calls
    # expect the smaller coordinates first.
    x0, y0, x1, y1 = obj['Rect']
    box = [min(x0, x1), min(y0, y1), max(x0, x1), max(y0, y1)]

    if obj["Subtype"] == LIT('Square'):
        draw.rectangle(box, fill=None, outline='black', width=3)
    else:
        # The bounding box must be passed as ONE argument: unpacking it
        # (``*box``) would spill coordinates into the ``fill``/``outline``
        # positional slots and raise a TypeError.
        draw.ellipse(box, fill=None, outline='black', width=3)

    # account for the difference in coordinate systems
    # between pdf and images.
    img = img.transpose(Image.FLIP_TOP_BOTTOM)

    img.save(path)
    set_file_perms(path)
|
||||||
|
|||||||
@ -9,7 +9,8 @@ from pdfminer.pdfparser import PDFParser
|
|||||||
from pdfminer.pdfdocument import PDFDocument
|
from pdfminer.pdfdocument import PDFDocument
|
||||||
from pdfminer.pdftypes import PDFObjRef, resolve1
|
from pdfminer.pdftypes import PDFObjRef, resolve1
|
||||||
|
|
||||||
from .utils import pdf_rect, ensure_dir, set_file_perms
|
from .utils import Rect, pdf_rect, ensure_dir, set_file_perms
|
||||||
|
from .img import write_inklist, write_square_or_circle
|
||||||
|
|
||||||
|
|
||||||
def make_product_box(obj, pagenum, mediabox):
|
def make_product_box(obj, pagenum, mediabox):
|
||||||
@ -39,7 +40,31 @@ def make_product_box(obj, pagenum, mediabox):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def make_scribble(obj, pagenum, mediabox, workdir):
|
def make_ink_scribble(obj, pagenum, mediabox, workdir):
    """Export an ink annotation to a PNG and return its scribble record.

    The PNG is written into ``workdir`` by ``write_inklist`` and is named
    after the page number and the annotation's ``'NM'`` entry.

    Returns:
        A dict with the keys ``'page'``, ``'rect'`` (a ``Rect`` built from
        the mediabox), ``'objid'`` and ``'image'`` (the PNG path).
    """
    # 'NM' is stored as bytes; decode it before using it in names
    objid = obj['NM'].decode('utf-8')
    filename = f"export-page{pagenum:03d}-nm{objid}.png"
    image_path = os.path.join(workdir, filename)

    write_inklist(obj, mediabox, image_path)

    scribble = {
        'page': pagenum,
        'rect': Rect(*mediabox),
        'objid': objid,
        'image': image_path,
    }
    return scribble
|
||||||
|
|
||||||
|
|
||||||
|
def make_square_or_circle_scribble(obj, pagenum, mediabox, workdir):
    """Export a square/circle annotation to a PNG and return its scribble record.

    The PNG is written into ``workdir`` by ``write_square_or_circle`` and is
    named after the page number and the annotation's ``'NM'`` entry.

    Returns:
        A dict with the keys ``'page'``, ``'rect'`` (a ``Rect`` built from
        the mediabox), ``'objid'`` and ``'image'`` (the PNG path).
    """
    # 'NM' is stored as bytes; decode it before using it in names
    objid = obj['NM'].decode('utf-8')
    filename = f"export-page{pagenum:03d}-nm{objid}.png"
    image_path = os.path.join(workdir, filename)

    write_square_or_circle(obj, mediabox, image_path)

    scribble = {
        'page': pagenum,
        'rect': Rect(*mediabox),
        'objid': objid,
        'image': image_path,
    }
    return scribble
|
||||||
|
|
||||||
|
|
||||||
|
def make_aapl_scribble(obj, pagenum, mediabox, workdir):
|
||||||
rect = obj['Rect'] # position on page
|
rect = obj['Rect'] # position on page
|
||||||
|
|
||||||
# walk the object tree down to the image
|
# walk the object tree down to the image
|
||||||
@ -143,6 +168,17 @@ def write_pbm(obj, base_path):
|
|||||||
return path
|
return path
|
||||||
|
|
||||||
|
|
||||||
|
def is_inklist_annotation(anno):
    """Return True when ``anno`` has a ``'Subtype'`` equal to ``LIT('Ink')``."""
    # annotations without a subtype can never be ink annotations
    if 'Subtype' not in anno:
        return False
    return anno['Subtype'] == LIT('Ink')
|
||||||
|
|
||||||
|
|
||||||
|
def is_square_or_circle_annotation(anno):
    """Return True when ``anno``'s ``'Subtype'`` is ``LIT('Square')`` or ``LIT('Circle')``.

    Annotations without a ``'Subtype'`` entry yield False.
    """
    if 'Subtype' not in anno:
        return False
    subtype = anno['Subtype']
    return subtype == LIT('Square') or subtype == LIT('Circle')
|
||||||
|
|
||||||
|
|
||||||
def parse_pdf(fname, workdir, debug=0):
|
def parse_pdf(fname, workdir, debug=0):
|
||||||
PDFDocument.debug = debug
|
PDFDocument.debug = debug
|
||||||
PDFParser.debug = debug
|
PDFParser.debug = debug
|
||||||
@ -173,10 +209,18 @@ def parse_pdf(fname, workdir, debug=0):
|
|||||||
|
|
||||||
for anno in annots:
|
for anno in annots:
|
||||||
anno = resolve1(anno)
|
anno = resolve1(anno)
|
||||||
if 'AAPL:AKExtras' in anno:
|
if is_inklist_annotation(anno):
|
||||||
scribbles.append(make_scribble(anno, pagenum, mediabox, workdir))
|
scribbles.append(make_ink_scribble(anno, pagenum, mediabox, workdir))
|
||||||
|
elif is_square_or_circle_annotation(anno):
|
||||||
|
scribbles.append(make_square_or_circle_scribble(anno, pagenum, mediabox, workdir))
|
||||||
|
elif 'AAPL:AKExtras' in anno:
|
||||||
|
scribbles.append(make_aapl_scribble(anno, pagenum, mediabox, workdir))
|
||||||
elif 'ProCatName' in anno:
|
elif 'ProCatName' in anno:
|
||||||
prod_boxes.append(make_product_box(anno, pagenum, mediabox))
|
prod_boxes.append(make_product_box(anno, pagenum, mediabox))
|
||||||
|
elif anno['Subtype'] == LIT('FreeText'):
|
||||||
|
print('ignoring FreeText annotation')
|
||||||
|
elif anno['Subtype'] == LIT('Highlight'):
|
||||||
|
print('ignoring Highlight annotation')
|
||||||
else:
|
else:
|
||||||
print('ignoring other annotation:')
|
print('ignoring other annotation:')
|
||||||
print(anno)
|
print(anno)
|
||||||
|
|||||||
@ -2,13 +2,16 @@ from __future__ import absolute_import, unicode_literals
|
|||||||
from celery import task, shared_task
|
from celery import task, shared_task
|
||||||
from celery.utils.log import get_task_logger
|
from celery.utils.log import get_task_logger
|
||||||
|
|
||||||
import os
|
|
||||||
import re
|
|
||||||
import sys
|
|
||||||
import datetime
|
import datetime
|
||||||
import fileinput
|
import fileinput
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import shutil
|
||||||
import smtplib
|
import smtplib
|
||||||
|
import sys
|
||||||
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from os.path import basename, dirname, isfile
|
||||||
|
|
||||||
from email.feedparser import FeedParser
|
from email.feedparser import FeedParser
|
||||||
from email.message import EmailMessage
|
from email.message import EmailMessage
|
||||||
@ -75,7 +78,6 @@ def process_attachment(from_address, subject, attachment):
|
|||||||
print(f'Using pdf name: {pdf_name}')
|
print(f'Using pdf name: {pdf_name}')
|
||||||
|
|
||||||
pdf_base = Path(pdf_name).stem
|
pdf_base = Path(pdf_name).stem
|
||||||
|
|
||||||
workdir = os.path.join(WORKDIR, clean_path(from_address), pdf_base)
|
workdir = os.path.join(WORKDIR, clean_path(from_address), pdf_base)
|
||||||
ensure_dir(workdir)
|
ensure_dir(workdir)
|
||||||
pdf_path = os.path.join(workdir, pdf_name)
|
pdf_path = os.path.join(workdir, pdf_name)
|
||||||
@ -84,6 +86,32 @@ def process_attachment(from_address, subject, attachment):
|
|||||||
att.write(attachment.get_payload(decode=True))
|
att.write(attachment.get_payload(decode=True))
|
||||||
set_file_perms(pdf_path)
|
set_file_perms(pdf_path)
|
||||||
|
|
||||||
|
process_pdf(pdf_path, from_address, subject, workdir)
|
||||||
|
|
||||||
|
|
||||||
|
@shared_task(on_failure=on_fail_handler)
def process_markup_pdf(pdf_path, user):
    """Copy a marked-up PDF into the user's work directory and process it.

    Args:
        pdf_path: Location of the PDF; nothing happens beyond a printed
            notice when it is not an existing file.
        user: Object providing ``username``, ``email`` and
            ``get_full_name()``; used to pick the work directory and to
            build the sender header handed to ``process_pdf``.
    """
    source = Path(pdf_path)
    if not source.is_file():
        print(f'No pdf - exiting ({pdf_path})')
        return

    # per-user, per-document working directory
    workdir = os.path.join(
        WORKDIR, clean_path(user.username), clean_path(source.stem))
    ensure_dir(workdir)

    pdf_name = source.name
    dest_path = os.path.join(workdir, pdf_name)
    print(f'copying pdf to {dest_path}')
    shutil.copy(pdf_path, dest_path)
    set_file_perms(dest_path)

    # build the sender and subject strings passed on to process_pdf
    sender = f'{user.get_full_name()} <{user.email}>'
    frm = str(make_header(decode_header(sender)))
    subject = str(make_header(decode_header(pdf_name)))

    process_pdf(dest_path, frm, subject, workdir)
|
||||||
|
|
||||||
|
|
||||||
|
def process_pdf(pdf_path, from_address, subject, workdir):
|
||||||
# find matches
|
# find matches
|
||||||
matches = find_marked_products(pdf_path, workdir, debug=0)
|
matches = find_marked_products(pdf_path, workdir, debug=0)
|
||||||
if not matches:
|
if not matches:
|
||||||
@ -94,7 +122,8 @@ def process_attachment(from_address, subject, attachment):
|
|||||||
print(f'{len(matches)} product matches')
|
print(f'{len(matches)} product matches')
|
||||||
|
|
||||||
# write spreadsheet
|
# write spreadsheet
|
||||||
xls_path = write_spreadsheet(matches, workdir, pdf_base)
|
pdf_stem = Path(pdf_path).stem
|
||||||
|
xls_path = write_spreadsheet(matches, workdir, pdf_stem)
|
||||||
|
|
||||||
if xls_path:
|
if xls_path:
|
||||||
# send reply
|
# send reply
|
||||||
|
|||||||
Reference in New Issue
Block a user