package com.appian.documentunderstanding.tabula;

import com.appian.documentunderstanding.common.DocumentUnderstandingContentServiceAdapter;
import com.appian.documentunderstanding.exception.DocExtractionGenericException;
import com.appian.documentunderstanding.exception.InvalidAppianDocumentException;
import com.appian.documentunderstanding.exception.InvalidDocumentAttributesException;
import com.appian.documentunderstanding.exception.PermissionsException;
import com.appian.documentunderstanding.function.OcrJobContext;
import com.appian.documentunderstanding.pdf.PdfTextDetector;
import com.appian.documentunderstanding.populate.InterpretedPage;
import com.appian.documentunderstanding.populate.InterpretedPoint;
import com.appian.documentunderstanding.populate.OcrResult;
import com.appian.documentunderstanding.prediction.metrics.DocExtractPredictionMetricsCollector;
import com.appiancorp.common.monitoring.Stopwatch;
import com.appiancorp.type.cdt.value.ReconcileAnnotation;
import com.appiancorp.type.cdt.value.ReconcileCoordinates;
import com.google.common.collect.ImmutableList;
import java.io.BufferedOutputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Comparator;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.log4j.Logger;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.font.PDType0Font;
import technology.tabula.ObjectExtractor;
import technology.tabula.Page;
import technology.tabula.PageIterator;
import technology.tabula.Table;
import technology.tabula.extractors.ExtractionAlgorithm;

/* loaded from: input_file:com/appian/documentunderstanding/tabula/TabulaExtractor.class */
/**
 * Extracts a single table from one page of a PDF document using Tabula.
 *
 * <p>When the requested page carries no text of its own (as reported by {@link PdfTextDetector}),
 * the text lines of the supplied {@link OcrResult} are first written onto the page so that the
 * configured {@link ExtractionAlgorithm} has text to work with.
 */
public class TabulaExtractor {
    private static final Logger LOG = Logger.getLogger(TabulaExtractor.class);

    /** Classpath resource (resolved relative to this class) of the font used for injected OCR text. */
    private static final String SEARCHABLE_TEXT_FONT = "NotoMono-Regular.ttf";

    private final DocumentUnderstandingContentServiceAdapter contentServiceAdapter;
    private final ExtractionAlgorithm algorithm;
    private final DocExtractPredictionMetricsCollector predictionMetricsCollector;
    private final TextSanitizer textSanitizer = new TextSanitizer();

    /**
     * Placement instructions for one OCR text line: the font size to select, the offset passed to
     * {@code newLineAtOffset}, and the text to show.
     */
    public static class SearchableTextTransformation {
        private final float fontSize;
        private final float newlineXOffset;
        private final float newlineYOffset;
        private final String text;

        /**
         * @param f font size used when the text is written
         * @param f2 horizontal text offset
         * @param f3 vertical text offset
         * @param str text of the line
         */
        SearchableTextTransformation(float f, float f2, float f3, String str) {
            this.fontSize = f;
            this.newlineXOffset = f2;
            this.newlineYOffset = f3;
            this.text = str;
        }

        /** @return the font size used when the text is written */
        public float getFontSize() {
            return this.fontSize;
        }

        /** @return the horizontal text offset */
        public float getNewlineXOffset() {
            return this.newlineXOffset;
        }

        /** @return the vertical text offset */
        public float getNewlineYOffset() {
            return this.newlineYOffset;
        }

        /** @return the text of the line */
        public String getText() {
            return this.text;
        }
    }

    /**
     * @param documentUnderstandingContentServiceAdapter source of the document's input stream
     * @param extractionAlgorithm Tabula algorithm applied to the annotated page area
     * @param docExtractPredictionMetricsCollector receives the measured durations
     */
    public TabulaExtractor(DocumentUnderstandingContentServiceAdapter documentUnderstandingContentServiceAdapter, ExtractionAlgorithm extractionAlgorithm, DocExtractPredictionMetricsCollector docExtractPredictionMetricsCollector) {
        this.contentServiceAdapter = documentUnderstandingContentServiceAdapter;
        this.algorithm = extractionAlgorithm;
        this.predictionMetricsCollector = docExtractPredictionMetricsCollector;
    }

    /**
     * Extracts exactly one table from the annotated area of a page.
     *
     * <p>The annotation's corner coordinates are multiplied by the OCR page's width and height to
     * obtain the area handed to Tabula. The extraction duration is recorded whether or not
     * extraction succeeds.
     *
     * @param num id of the document to load
     * @param num2 page number; the PDFBox page lookup subtracts one, so it is treated as 1-based
     * @param reconcileAnnotation annotation whose top-left / bottom-right corners bound the table
     * @param ocrResult OCR output for the document
     * @return the single table found in the annotated area
     * @throws DocExtractionGenericException if the document cannot be loaded, no page or OCR page is
     *     available, or the area yields zero or more than one table
     */
    public Table extractTable(Integer num, Integer num2, ReconcileAnnotation reconcileAnnotation, OcrResult ocrResult) throws DocExtractionGenericException {
        try {
            PageIterator pages = loadDocumentPages(Long.valueOf(num.longValue()), num2, ocrResult);
            Stopwatch stopwatch = new Stopwatch();
            try {
                if (!pages.hasNext()) {
                    throw new DocExtractionGenericException(String.format("No Tabula pages loaded for document %d", num));
                }
                Page tabulaPage = pages.next();
                InterpretedPage ocrPage = ocrResult.getPages().get(num2);
                if (ocrPage == null) {
                    // Fail with a descriptive error instead of a NullPointerException below.
                    throw new DocExtractionGenericException(String.format("No OCR page available for document %d on page %d", num, num2));
                }
                double pageHeight = ocrPage.getPageHeight();
                double pageWidth = ocrPage.getPageWidth();
                ReconcileCoordinates topLeft = reconcileAnnotation.getTopLeft();
                ReconcileCoordinates bottomRight = reconcileAnnotation.getBottomRight();
                // Argument order is top, left, bottom, right.
                Page annotatedArea = tabulaPage.getArea(
                        (float) (topLeft.getyValue() * pageHeight),
                        (float) (topLeft.getxValue() * pageWidth),
                        (float) (bottomRight.getyValue() * pageHeight),
                        (float) (bottomRight.getxValue() * pageWidth));
                List<? extends Table> tables = this.algorithm.extract(annotatedArea);
                if (tables.isEmpty()) {
                    throw new DocExtractionGenericException(String.format("No Tabula tables extracted for document %d on page %d", num, num2));
                }
                if (tables.size() > 1) {
                    throw new DocExtractionGenericException(String.format("Multiple Tabula tables extracted for document %d on page %d", num, num2));
                }
                return tables.get(0);
            } finally {
                this.predictionMetricsCollector.recordTabulaExtractionDuration(Long.valueOf(stopwatch.measureMillis()));
            }
        } catch (InvalidAppianDocumentException | InvalidDocumentAttributesException | PermissionsException | IOException e) {
            // The explicit cast keeps the constructor overload the original code selected.
            throw new DocExtractionGenericException(String.format("Error loading PDF document %d", num), (Throwable) e);
        }
    }

    /**
     * Writes the OCR text of page {@code num} onto the PDF page when that page has no text yet,
     * recording how long the transformation took.
     *
     * @return the same document instance that was passed in
     */
    private PDDocument addPdfOcrText(PDDocument pDDocument, Integer num, OcrResult ocrResult) throws IOException {
        PDPage page = pDDocument.getPage(num.intValue() - 1);
        if (!isPdfSearchable(pDDocument, num)) {
            Stopwatch stopwatch = new Stopwatch();
            insertSearchableText(pDDocument, page, generateSearchableTextTransformations(pDDocument, page, ocrResult.getPages().get(num)));
            this.predictionMetricsCollector.recordTabulaTextTransformationDuration(Long.valueOf(stopwatch.measureMillis()));
        }
        return pDDocument;
    }

    /**
     * Computes, for every OCR line of the page, where and how large its text has to be written.
     *
     * <p>Lines without text or without annotation points are skipped: they have nothing to render
     * and would otherwise produce a non-finite font size.
     *
     * @param pDDocument document the font is loaded into
     * @param pDPage PDF page whose media box height is used to flip the vertical coordinate
     * @param interpretedPage OCR page supplying the lines and the page dimensions
     * @return one transformation per renderable OCR line, in line order
     * @throws IOException if the font cannot be loaded
     */
    List<SearchableTextTransformation> generateSearchableTextTransformations(PDDocument pDDocument, PDPage pDPage, InterpretedPage interpretedPage) throws IOException {
        float averageGlyphWidth = loadSearchableTextFont(pDDocument).getAverageFontWidth();
        double pageWidth = interpretedPage.getPageWidth();
        double pageHeight = interpretedPage.getPageHeight();
        double mediaBoxHeight = pDPage.getMediaBox().getHeight();
        return interpretedPage.getLines().stream()
                .filter(line -> isRenderable(line.getLineText(), line.getAnnotation()))
                .map(line -> toTransformation(line.getLineText(), line.getAnnotation(), pageWidth, pageHeight, mediaBoxHeight, averageGlyphWidth))
                .collect(Collectors.toList());
    }

    /** Tells whether a line has both text to show and annotation points to position it with. */
    private static boolean isRenderable(String lineText, List<InterpretedPoint> corners) {
        return lineText != null && !lineText.isEmpty() && corners != null && !corners.isEmpty();
    }

    /**
     * Builds the placement of one line from the bounding extremes of its annotation points.
     *
     * <p>The font size is chosen so that {@code lineText.length()} glyphs of average width span the
     * line's horizontal extent; glyph widths are expressed in 1/1000 of the font size.
     */
    private static SearchableTextTransformation toTransformation(String lineText, List<InterpretedPoint> corners, double pageWidth, double pageHeight, double mediaBoxHeight, float averageGlyphWidth) {
        double left = Double.POSITIVE_INFINITY;
        double right = Double.NEGATIVE_INFINITY;
        double bottom = Double.NEGATIVE_INFINITY;
        for (InterpretedPoint corner : corners) {
            left = Math.min(left, corner.getX());
            right = Math.max(right, corner.getX());
            bottom = Math.max(bottom, corner.getY());
        }
        double lineWidth = (right - left) * pageWidth;
        float fontSize = (float) (lineWidth / lineText.length() / averageGlyphWidth * 1000.0d);
        float xOffset = (float) (left * pageWidth);
        // The largest OCR y value is subtracted from the media box height to obtain the PDF y offset.
        float yOffset = (float) (mediaBoxHeight - (bottom * pageHeight));
        return new SearchableTextTransformation(fontSize, xOffset, yOffset, lineText);
    }

    /**
     * Writes every transformation's text onto the page, then saves the document to a throwaway
     * in-memory stream.
     *
     * <p>If a line cannot be shown as-is, a warning is logged and the line is written again after
     * passing it through the {@link TextSanitizer}.
     */
    private void insertSearchableText(PDDocument pDDocument, PDPage pDPage, List<SearchableTextTransformation> list) throws IOException {
        PDType0Font font = loadSearchableTextFont(pDDocument);
        // try-with-resources closes the content stream even when writing a line fails.
        try (PDPageContentStream content = new PDPageContentStream(pDDocument, pDPage)) {
            for (SearchableTextTransformation line : list) {
                content.beginText();
                content.setFont(font, line.getFontSize());
                content.newLineAtOffset(line.getNewlineXOffset(), line.getNewlineYOffset());
                try {
                    content.showText(line.getText());
                } catch (IOException | RuntimeException e) {
                    // JVM Errors are deliberately not retried; they propagate to the caller.
                    LOG.warn("Exception encountered when adding line text to document, falling back to replace unsupported characters", e);
                    content.showText(this.textSanitizer.sanitizeTextForFont(font, line.getText()));
                }
                content.endText();
            }
        }
        // NOTE(review): the output is discarded; presumably the save is only there to make PDFBox
        // finalize the embedded font before Tabula reads the page - confirm before removing.
        pDDocument.save(new BufferedOutputStream(new ByteArrayOutputStream()));
    }

    /**
     * Loads the bundled font into the given document.
     *
     * @throws IOException if the font resource is missing or cannot be parsed
     */
    private static PDType0Font loadSearchableTextFont(PDDocument document) throws IOException {
        try (InputStream fontStream = TabulaExtractor.class.getResourceAsStream(SEARCHABLE_TEXT_FONT)) {
            if (fontStream == null) {
                throw new IOException("Font resource " + SEARCHABLE_TEXT_FONT + " not found on the classpath");
            }
            return PDType0Font.load(document, fontStream);
        }
    }

    /**
     * Loads the document, injects OCR text into page {@code num} if needed, and returns a Tabula
     * iterator restricted to that page.
     *
     * <p>The content-service stream is closed once the PDF is loaded, and the PDF is closed if
     * preparing it fails. NOTE(review): on success the returned iterator wraps the still-open
     * {@link PDDocument} and nothing in this class closes it afterwards.
     *
     * @param l id of the document to load
     * @param num page number to iterate over
     * @param ocrResult OCR output used when the page has no text
     */
    PageIterator loadDocumentPages(Long l, Integer num, OcrResult ocrResult) throws InvalidAppianDocumentException, PermissionsException, InvalidDocumentAttributesException, IOException {
        PDDocument document;
        try (InputStream documentStream = this.contentServiceAdapter.getDocumentInputStream(l)) {
            document = PDDocument.load(documentStream);
        }
        try {
            return new PageIterator(new ObjectExtractor(addPdfOcrText(document, num, ocrResult)), ImmutableList.of(num));
        } catch (IOException | RuntimeException e) {
            try {
                document.close();
            } catch (IOException closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
    }

    /** Reports whether {@link PdfTextDetector} finds text on page {@code num} of the document. */
    private boolean isPdfSearchable(PDDocument pDDocument, Integer num) throws IOException {
        return new PdfTextDetector().isTextPresent(pDDocument, num.intValue());
    }
}
