Examples with BigFileSearcher - org.riversun.bigdoc.bin.BigFileSearcher

Example 1 with BigFileSearcher

Use of org.riversun.bigdoc.bin.BigFileSearcher in the project mustangproject by ZUGFeRD.

Taken from the class ZUGFeRDValidator, method validate.

/**
 * Performs a validation on the file {@code filename} and returns the report as XML.
 *
 * <p>A file whose first bytes are {@code %PDF} is handed to the PDF validator; if that
 * validator exposes raw XML, the XML validator is run on it as well. Any other file is
 * accepted as XML only if it parses as a well-formed document. All findings are collected
 * in the shared validation context and serialised into a {@code <validation>} element,
 * which is pretty-printed before being returned.
 *
 * <p>Besides returning the report, this method updates the validator's state (checksum,
 * signature, PDF validity, overall validity) and writes one summary line to the log.
 *
 * @param filename the complete absolute filename of a PDF or XML
 * @return a xml string with the validation result
 */
public String validate(String filename) {
    boolean xmlValidity;
    context.clear();
    StringBuilder finalStringResult = new StringBuilder();
    SimpleDateFormat isoDF = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    Date date = new Date();
    startTime = System.currentTimeMillis();
    if (filename != null) {
        // set filename without path; getFileName() is null for a path without name elements
        Path namePart = Paths.get(filename).getFileName();
        if (namePart != null) {
            context.setFilename(namePart.toString());
        }
    }
    // The filename is user-controlled: escape it, otherwise a quote, ampersand or angle
    // bracket in the name makes the whole report unparseable further down.
    finalStringResult.append("<validation filename='")
            .append(escapeXmlAttribute(String.valueOf(context.getFilename())))
            .append("' datetime='")
            .append(isoDF.format(date))
            .append("'>");
    boolean isPDF = false;
    try {
        if (filename == null) {
            optionsRecognized = false;
            context.addResultItem(new ValidationResultItem(ESeverity.fatal, "Filename not specified").setSection(10).setPart(EPart.pdf));
        } else {
            PDFValidator pdfv = new PDFValidator(context);
            File file = new File(filename);
            if (!file.exists()) {
                context.addResultItem(new ValidationResultItem(ESeverity.fatal, "File not found").setSection(1).setPart(EPart.pdf));
            } else if (file.length() < 32) {
                // with less than 32 bytes it can not even be a proper XML file
                context.addResultItem(new ValidationResultItem(ESeverity.fatal, "File too small").setSection(5).setPart(EPart.pdf));
            } else {
                BigFileSearcher searcher = new BigFileSearcher();
                XMLValidator xv = new XMLValidator(context);
                if (disableNotices) {
                    xv.disableNotices();
                }
                byte[] pdfSignature = { '%', 'P', 'D', 'F' };
                // a PDF must carry the signature at the very first byte
                isPDF = searcher.indexOf(file, pdfSignature) == 0;
                if (isPDF) {
                    pdfv.setFilename(filename);
                    optionsRecognized = true;
                    finalStringResult.append("<pdf>");
                    try {
                        pdfv.validate();
                        sha1Checksum = calcSHA1(file);
                        // Validate PDF
                        finalStringResult.append(pdfv.getXMLResult());
                        // remember the PDF verdict before the context is reused for the XML part
                        pdfValidity = context.isValid();
                        Signature = context.getSignature();
                        // clear sets valid to true again
                        context.clear();
                        if (pdfv.getRawXML() != null) {
                            xv.setStringContent(pdfv.getRawXML());
                            displayXMLValidationOutput = true;
                        } else {
                            context.addResultItem(new ValidationResultItem(ESeverity.exception, "XML could not be extracted").setSection(17));
                        }
                    } catch (IrrecoverableValidationError irx) {
                        // @todo log
                    }
                    finalStringResult.append("</pdf>\n");
                    context.clearCustomXML();
                } else if (looksLikeXml(file)) {
                    // there is no PDF part that could be invalid
                    pdfValidity = true;
                    optionsRecognized = true;
                    xv.setFilename(filename);
                    if (file.exists()) {
                        sha1Checksum = calcSHA1(file);
                    }
                    displayXMLValidationOutput = true;
                } else {
                    optionsRecognized = false;
                    context.addResultItem(new ValidationResultItem(ESeverity.exception, "File does not look like PDF nor XML (contains neither %PDF nor <?xml)").setSection(8));
                }
                if ((optionsRecognized) && (displayXMLValidationOutput)) {
                    finalStringResult.append("<xml>");
                    try {
                        xv.validate();
                    } catch (IrrecoverableValidationError irx) {
                        // @todo log
                    }
                    finalStringResult.append(xv.getXMLResult());
                    finalStringResult.append("</xml>");
                    context.clearCustomXML();
                }
                // the context was cleared after the PDF step, so carry a PDF failure over
                if ((isPDF) && (!pdfValidity)) {
                    context.setInvalid();
                }
            }
        }
    } catch (IrrecoverableValidationError irx) {
        // @todo log
    } finally {
        finalStringResult.append(context.getXMLResult());
        finalStringResult.append("</validation>");
    }
    OutputFormat format = OutputFormat.createPrettyPrint();
    StringWriter sw = new StringWriter();
    org.dom4j.Document document = null;
    try {
        document = DocumentHelper.parseText(finalStringResult.toString());
    } catch (DocumentException e1) {
        LOGGER.error(e1.getMessage(), e1);
    }
    XMLWriter writer = new XMLWriter(sw, format);
    try {
        writer.write(document);
    } catch (Exception e) {
        LOGGER.error(e.getMessage(), e);
    }
    xmlValidity = context.isValid();
    long duration = System.currentTimeMillis() - startTime;
    String toBeAppended = "";
    if (logAppend != null) {
        toBeAppended = logAppend;
    }
    String pdfResult = "invalid";
    if (!isPDF) {
        pdfResult = "absent";
    } else if (pdfValidity) {
        pdfResult = "valid";
    }
    LOGGER.info("Parsed PDF:" + pdfResult + " XML:" + (xmlValidity ? "valid" : "invalid") + " Signature:" + Signature + " Checksum:" + sha1Checksum + " Profile:" + context.getProfile() + " Version:" + context.getGeneration() + " Took:" + duration + "ms Errors:[" + context.getCSVResult() + "] " + toBeAppended);
    wasCompletelyValid = ((pdfValidity) && (xmlValidity));
    return sw.toString();
}

/**
 * Tells whether {@code file} parses as a well-formed XML document (after BOM removal,
 * decoded as UTF-8). Any failure, whether I/O or parsing, is reported as {@code false}.
 */
private static boolean looksLikeXml(File file) {
    try {
        DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
        disableExternalEntities(dbf);
        DocumentBuilder db = dbf.newDocumentBuilder();
        byte[] content = XMLTools.removeBOM(Files.readAllBytes(file.toPath()));
        String text = new String(content, StandardCharsets.UTF_8);
        db.parse(new InputSource(new StringReader(text)));
        // no exception so far
        return true;
    } catch (Exception ex) {
        // probably no xml file, sth like SAXParseException content not allowed in prolog
        return false;
    }
}

/**
 * Best-effort XXE hardening: the file under test is untrusted, so the parser must not
 * fetch external entities or DTDs. A parser that does not know a feature is left as is.
 */
private static void disableExternalEntities(DocumentBuilderFactory dbf) {
    String[] features = {
        "http://xml.org/sax/features/external-general-entities",
        "http://xml.org/sax/features/external-parameter-entities",
        "http://apache.org/xml/features/nonvalidating/load-external-dtd"
    };
    for (String feature : features) {
        try {
            dbf.setFeature(feature, false);
        } catch (javax.xml.parsers.ParserConfigurationException ignored) {
            // feature unknown to this parser implementation; continue with the others
        }
    }
}

/** Escapes the characters that are not allowed verbatim inside an XML attribute value. */
private static String escapeXmlAttribute(String raw) {
    StringBuilder escaped = new StringBuilder(raw.length() + 16);
    for (int i = 0; i < raw.length(); i++) {
        char c = raw.charAt(i);
        switch (c) {
            case '&':
                escaped.append("&amp;");
                break;
            case '<':
                escaped.append("&lt;");
                break;
            case '>':
                escaped.append("&gt;");
                break;
            case '\'':
                escaped.append("&apos;");
                break;
            case '"':
                escaped.append("&quot;");
                break;
            default:
                escaped.append(c);
        }
    }
    return escaped.toString();
}

Also used : InputSource(org.xml.sax.InputSource) DocumentBuilderFactory(javax.xml.parsers.DocumentBuilderFactory) Element(org.w3c.dom.Element) Document(org.w3c.dom.Document) XMLWriter(org.dom4j.io.XMLWriter) DocumentException(org.dom4j.DocumentException) BigFileSearcher(org.riversun.bigdoc.bin.BigFileSearcher) Path(java.nio.file.Path) OutputFormat(org.dom4j.io.OutputFormat) Date(java.util.Date) DocumentException(org.dom4j.DocumentException) NoSuchAlgorithmException(java.security.NoSuchAlgorithmException) DocumentBuilder(javax.xml.parsers.DocumentBuilder) SimpleDateFormat(java.text.SimpleDateFormat)

Example 2 with BigFileSearcher

Use of org.riversun.bigdoc.bin.BigFileSearcher in the project mustangproject by ZUGFeRD.

Taken from the class PDFValidator, method validate.

/**
 * Validates the PDF file {@code pdfFilename} and records all findings in the context.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>run the veraPDF batch processor (task {@code VALIDATE}) and keep its report,</li>
 *   <li>check the XMP metadata for ConformanceLevel, DocumentType, DocumentFileName and
 *       Version,</li>
 *   <li>search the file for byte signatures of known producer tools,</li>
 *   <li>validate every additional-data attachment against the additional data schema.</li>
 * </ol>
 * Finally the veraPDF report plus an {@code <info>} element (signature, duration) is added
 * to the context as custom XML.
 *
 * @throws IrrecoverableValidationError declared by the overridden method; nothing in this
 *         body throws it directly
 */
@Override
public void validate() throws IrrecoverableValidationError {
    zfXML = null;
    final File file = new File(pdfFilename);
    // file existence must have been checked before
    final BigFileSearcher searcher = new BigFileSearcher();
    final byte[] pdfSignature = { '%', 'P', 'D', 'F' };
    if (searcher.indexOf(file, pdfSignature) != 0) {
        context.addResultItem(new ValidationResultItem(ESeverity.fatal, "Not a PDF file " + pdfFilename).setSection(20).setPart(EPart.pdf));
    }
    final long startPDFTime = System.currentTimeMillis();

    // Step 1 Validate PDF
    VeraGreenfieldFoundryProvider.initialise();
    // Default validator, features, plugins and fixer config
    final ValidatorConfig validatorConfig = ValidatorFactory.defaultConfig();
    final FeatureExtractorConfig featureConfig = FeatureFactory.defaultConfig();
    final PluginsCollectionConfig pluginsConfig = PluginsCollectionConfig.defaultConfig();
    final MetadataFixerConfig fixerConfig = FixerFactory.defaultConfig();
    // Tasks configuring: validation only, no feature extraction or metadata fixing
    final EnumSet<TaskType> tasks = EnumSet.of(TaskType.VALIDATE);
    // Creating processor config
    final ProcessorConfig processorConfig = ProcessorFactory.fromValues(validatorConfig, featureConfig, pluginsConfig, fixerConfig, tasks);
    // Creating processor and output stream.
    final ByteArrayOutputStream reportStream = new ByteArrayOutputStream();
    try (BatchProcessor processor = ProcessorFactory.fileBatchProcessor(processorConfig)) {
        // Generating list of files for processing
        final List<File> files = new ArrayList<>();
        files.add(file);
        // starting the processor
        processor.process(files, ProcessorFactory.getHandler(FormatOption.MRR, true, reportStream, 100, processorConfig.getValidatorConfig().isRecordPasses()));
        // strip the XML declaration so the report can be embedded into a larger document
        pdfReport = reportStream.toString("utf-8").replaceAll("<\\?xml version=\"1\\.0\" encoding=\"utf-8\"\\?>", "");
    } catch (final VeraPDFException e) {
        context.addResultItem(new ValidationResultItem(ESeverity.exception, e.getMessage()).setSection(6).setPart(EPart.pdf).setStacktrace(stackTraceOf(e)));
    } catch (final IOException excep) {
        // getStackTrace().toString() would only yield the array identity, not the trace
        context.addResultItem(new ValidationResultItem(ESeverity.exception, excep.getMessage()).setSection(7).setPart(EPart.pdf).setStacktrace(stackTraceOf(excep)));
    }

    // step 2 validate XMP
    final ZUGFeRDImporter zi = new ZUGFeRDImporter(pdfFilename);
    final String xmp = zi.getXMP();
    if (xmp == null || xmp.length() == 0) {
        // nothing to parse; parsing an empty string would only fail with a SAXException
        context.addResultItem(new ValidationResultItem(ESeverity.error, "Invalid XMP Metadata not found").setSection(17).setPart(EPart.pdf));
    } else {
        /*
         * checking for sth like <zf:ConformanceLevel>EXTENDED</zf:ConformanceLevel>
         * <zf:DocumentType>INVOICE</zf:DocumentType>
         * <zf:DocumentFileName>ZUGFeRD-invoice.xml</zf:DocumentFileName>
         * <zf:Version>1.0</zf:Version>
         */
        try {
            final DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
            // The XMP packet comes from an untrusted PDF: do not resolve external entities/DTDs.
            final String[] externalEntityFeatures = {
                "http://xml.org/sax/features/external-general-entities",
                "http://xml.org/sax/features/external-parameter-entities",
                "http://apache.org/xml/features/nonvalidating/load-external-dtd"
            };
            for (final String feature : externalEntityFeatures) {
                try {
                    factory.setFeature(feature, false);
                } catch (final ParserConfigurationException ignored) {
                    // feature unknown to this parser implementation; continue with the others
                }
            }
            final DocumentBuilder builder = factory.newDocumentBuilder();
            final Document docXMP = builder.parse(new InputSource(new StringReader(xmp)));
            final XPath xpath = XPathFactory.newInstance().newXPath();

            final String[] conformanceLevels = { "BASIC WL", "BASIC", "MINIMUM", "EN 16931", "COMFORT", "CIUS", "EXTENDED", "XRECHNUNG" };
            final List<String> foundLevels = extractXmpValues(xpath, docXMP, "ConformanceLevel");
            if (foundLevels.isEmpty()) {
                context.addResultItem(new ValidationResultItem(ESeverity.error, "XMP Metadata: ConformanceLevel not found").setSection(11).setPart(EPart.pdf));
            }
            if (!anyValueAllowed(foundLevels, conformanceLevels)) {
                context.addResultItem(new ValidationResultItem(ESeverity.error, "XMP Metadata: ConformanceLevel contains invalid value").setSection(12).setPart(EPart.pdf));
            }

            final List<String> documentTypes = java.util.Arrays.asList("INVOICE", "ORDER", "ORDER_RESPONSE", "ORDER_CHANGE");
            final List<String> foundTypes = extractXmpValues(xpath, docXMP, "DocumentType");
            if (foundTypes.isEmpty()) {
                context.addResultItem(new ValidationResultItem(ESeverity.error, "XMP Metadata: DocumentType not found").setSection(13).setPart(EPart.pdf));
            }
            boolean documentTypeValid = false;
            for (final String foundType : foundTypes) {
                if (documentTypes.contains(foundType)) {
                    documentTypeValid = true;
                    break;
                }
            }
            if (!documentTypeValid) {
                context.addResultItem(new ValidationResultItem(ESeverity.error, "XMP Metadata: DocumentType invalid").setSection(14).setPart(EPart.pdf));
            }

            // e.g. ZUGFeRD-invoice.xml
            final String[] documentFileNames = { "factur-x.xml", "ZUGFeRD-invoice.xml", "zugferd-invoice.xml", "xrechnung.xml", "order-x.xml" };
            final List<String> foundFileNames = extractXmpValues(xpath, docXMP, "DocumentFileName");
            if (foundFileNames.isEmpty()) {
                context.addResultItem(new ValidationResultItem(ESeverity.error, "XMP Metadata: DocumentFileName not found").setSection(21).setPart(EPart.pdf));
            }
            if (!anyValueAllowed(foundFileNames, documentFileNames)) {
                context.addResultItem(new ValidationResultItem(ESeverity.error, "XMP Metadata: DocumentFileName contains invalid value").setSection(19).setPart(EPart.pdf));
            }

            // 1.2, 2.0 and 2.1 are for xrechnung 1.2, 2p0 can be ZF 2.0, 2.1, 2.1.1
            final String[] versions = { "1.0", "2p0", "1.2", "2.0", "2.1" };
            final List<String> foundVersions = extractXmpValues(xpath, docXMP, "Version");
            if (foundVersions.isEmpty()) {
                context.addResultItem(new ValidationResultItem(ESeverity.error, "XMP Metadata: Version not found").setSection(15).setPart(EPart.pdf));
            }
            if (!anyValueAllowed(foundVersions, versions)) {
                context.addResultItem(new ValidationResultItem(ESeverity.error, "XMP Metadata: Version contains invalid value").setSection(16).setPart(EPart.pdf));
            }
        } catch (final SAXException e) {
            LOGGER.error(e.getMessage(), e);
        } catch (final IOException e) {
            LOGGER.error(e.getMessage(), e);
        } catch (final ParserConfigurationException e) {
            LOGGER.error(e.getMessage(), e);
        } catch (final XPathExpressionException e) {
            LOGGER.error(e.getMessage(), e);
        }
    }
    zfXML = zi.getUTF8();

    // step 3 find signatures: { byte pattern searched in the file, reported producer name }
    final String[][] knownProducers = {
        { "Symtrax", "Symtrax" },
        { "via mustangproject", "Mustang" },
        { "by Alexis de Lattre", "Factur/X Python" },
        { "intarsys ", "Intarsys" },
        { "Konik", "Konik" },
        { "pdfMachine from Broadgun Software", "pdfMachine" },
        { "%%Invocation:", "Ghostscript" }
    };
    // first match wins, in the order listed above
    for (final String[] producer : knownProducers) {
        final byte[] pattern = producer[0].getBytes(java.nio.charset.StandardCharsets.UTF_8);
        if (searcher.indexOf(file, pattern) != -1) {
            Signature = producer[1];
            break;
        }
    }
    context.setSignature(Signature);

    // step 4:validate additional data
    final HashMap<String, byte[]> additionalData = zi.getAdditionalData();
    for (final String filename : additionalData.keySet()) {
        // validating xml in byte[] additionalData.get(filename)
        LOGGER.info("validating additionalData " + filename);
        validateSchema(additionalData.get(filename), "ad/basic/additional_data_base_schema.xsd", 2, EPart.pdf);
    }

    // end
    final long endTime = System.currentTimeMillis();
    // pdfReport is only assigned when the processor run succeeded; treat a missing report
    // as an empty one instead of failing with a NullPointerException
    final String report = (pdfReport == null) ? "" : pdfReport;
    if (!report.contains("validationReports compliant=\"1\"")) {
        context.setInvalid();
    }
    if (!report.contains("PDF/A-3")) {
        context.addResultItem(new ValidationResultItem(ESeverity.error, "Not a PDF/A-3").setSection(23).setPart(EPart.pdf));
    }
    context.addCustomXML(report + "<info><signature>" + ((context.getSignature() != null) ? context.getSignature() : "unknown") + "</signature><duration unit=\"ms\">" + (endTime - startPDFTime) + "</duration></info>");
}

/**
 * Collects the text of every element named {@code name} (any namespace) and of every
 * {@code name} attribute on a {@code Description} element.
 */
private static List<String> extractXmpValues(final XPath xpath, final Document doc, final String name) throws XPathExpressionException {
    final XPathExpression xpr = xpath.compile("//*[local-name()=\"" + name + "\"]|//*[local-name()=\"Description\"]/@" + name);
    final NodeList nodes = (NodeList) xpr.evaluate(doc, XPathConstants.NODESET);
    final List<String> values = new ArrayList<>(nodes.getLength());
    for (int i = 0; i < nodes.getLength(); i++) {
        values.add(nodes.item(i).getTextContent());
    }
    return values;
}

/** Returns true if at least one of {@code values} is accepted by {@code stringArrayContains}. */
private boolean anyValueAllowed(final List<String> values, final String[] allowed) {
    for (final String value : values) {
        if (stringArrayContains(allowed, value)) {
            return true;
        }
    }
    return false;
}

/** Renders the full stack trace of {@code t} as text. */
private static String stackTraceOf(final Throwable t) {
    final StringWriter sw = new StringWriter();
    final PrintWriter pw = new PrintWriter(sw);
    t.printStackTrace(pw);
    pw.flush();
    return sw.toString();
}

Also used : XPathExpression(javax.xml.xpath.XPathExpression) InputSource(org.xml.sax.InputSource) DocumentBuilderFactory(javax.xml.parsers.DocumentBuilderFactory) XPathExpressionException(javax.xml.xpath.XPathExpressionException) ArrayList(java.util.ArrayList) Document(org.w3c.dom.Document) ValidatorConfig(org.verapdf.pdfa.validation.validators.ValidatorConfig) ProcessorConfig(org.verapdf.processor.ProcessorConfig) VeraPDFException(org.verapdf.core.VeraPDFException) SAXException(org.xml.sax.SAXException) FeatureExtractorConfig(org.verapdf.features.FeatureExtractorConfig) XPathFactory(javax.xml.xpath.XPathFactory) BatchProcessor(org.verapdf.processor.BatchProcessor) StringWriter(java.io.StringWriter) StringReader(java.io.StringReader) ParserConfigurationException(javax.xml.parsers.ParserConfigurationException) BigFileSearcher(org.riversun.bigdoc.bin.BigFileSearcher) PrintWriter(java.io.PrintWriter) XPath(javax.xml.xpath.XPath) ZUGFeRDImporter(org.mustangproject.ZUGFeRD.ZUGFeRDImporter) MetadataFixerConfig(org.verapdf.metadata.fixer.MetadataFixerConfig) EnumSet(java.util.EnumSet) NodeList(org.w3c.dom.NodeList) UnsupportedEncodingException(java.io.UnsupportedEncodingException) ByteArrayOutputStream(java.io.ByteArrayOutputStream) IOException(java.io.IOException) PluginsCollectionConfig(org.verapdf.processor.plugins.PluginsCollectionConfig) DocumentBuilder(javax.xml.parsers.DocumentBuilder) File(java.io.File)

Aggregations

DocumentBuilder (javax.xml.parsers.DocumentBuilder)2 DocumentBuilderFactory (javax.xml.parsers.DocumentBuilderFactory)2 BigFileSearcher (org.riversun.bigdoc.bin.BigFileSearcher)2 Document (org.w3c.dom.Document)2 InputSource (org.xml.sax.InputSource)2 ByteArrayOutputStream (java.io.ByteArrayOutputStream)1 File (java.io.File)1 IOException (java.io.IOException)1 PrintWriter (java.io.PrintWriter)1 StringReader (java.io.StringReader)1 StringWriter (java.io.StringWriter)1 UnsupportedEncodingException (java.io.UnsupportedEncodingException)1 Path (java.nio.file.Path)1 NoSuchAlgorithmException (java.security.NoSuchAlgorithmException)1 SimpleDateFormat (java.text.SimpleDateFormat)1 ArrayList (java.util.ArrayList)1 Date (java.util.Date)1 EnumSet (java.util.EnumSet)1 ParserConfigurationException (javax.xml.parsers.ParserConfigurationException)1 XPath (javax.xml.xpath.XPath)1