a sEc @sLddlZddlZddlZddlZddlmZedejZ GdddZ dS)N)getExtractedDataFromS3z([A-Z]*[\.!?]$)c@s$eZdZddZddZddZdS)PatternDetectorc stjd}|d}d}|D]r\}}t|dfdd|D}|rLq |} | durhd|<n| d7} || i|d7}|dkr qq |jd d} | D]F\}}t|d|} | durd|<q| d7} || iq|S) N SKIP_PAGES,rTextcsg|]}|vr|qSr).0eletxtr~/Users/sathyaws/work/code/ml/opensource/amazon-textract-pdf-text-extractor/lambda/textractpostprocessor/app/patternDetector.py z5PatternDetector.parseHeaderFooter..)osenvironsplititerrowsstrgetupdateiloc) self dataFiltered occurrencerskipPageiindexrow elemFoundpairZdfLast10rr r parseHeaderFooter s2        z!PatternDetector.parseHeaderFootercCsDtt|dkr@t|d}|ds<|s<|dkr@dSdS)Nr TF)lenpatternIncompleteLinefindallrisdigitisupper)rlinelengthrrr regexHeaderOrFooter2s z#PatternDetector.regexHeaderOrFooterc CsRg}d}d}i}zd}|d|} t|| } | d} | | D]t}| | d|k} | jd} | dkrpq|| |}||ks|dkrF|rF|D] \}}|dks||krd}qFqqF|sq|d7}qWn8ty}ztd |t|WYd}~n d}~00i}|D]&\}}|dks0||kr|||<qd d |D|S) NrT/Pager Fzend of processing :cSsg|]\}}t|d|qS):)print)rkvrrr r frz?PatternDetector.identifyHeaderFooterPattern..)runiquesortshaper#items Exceptionr3)r bucketName prefixPath totalPagesZheaderFooterPatternrpagerZfullScanprefixrpagesdatatotalRowr4r5efilterdHeaderFooterrrr identifyHeaderFooterPattern9sB      z+PatternDetector.identifyHeaderFooterPatternN)__name__ __module__ __qualname__r#r-rErrrr r s%r) pandaspdboto3rerawsUtilsrcompileMr'rrrrr s