Search in sources:

Example 1 with AutoDetectReader

use of org.apache.tika.detect.AutoDetectReader in project tika by apache.

Class TXTParser, method parse.

/**
 * Decodes {@code stream} with an automatically detected character encoding and
 * emits its entire text content as a single XHTML {@code <p>} element.
 *
 * <p>The content type written back to {@code metadata} is the incoming content
 * type (when present and parseable) or {@code text/plain}, combined with the
 * detected charset. The detected charset name is also stored under
 * {@code Metadata.CONTENT_ENCODING}.
 *
 * @param stream   the raw input; wrapped in a {@code CloseShieldInputStream} before reading
 * @param handler  receiver of the generated XHTML SAX events
 * @param metadata read for the incoming content type; updated with type and encoding
 * @param context  used to look up the encoding detector
 * @throws IOException   if reading from the stream fails
 * @throws SAXException  if the handler rejects an event
 * @throws TikaException declared by the parser contract
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(stream), metadata, getEncodingDetector(context))) {
        // The incoming type may be a subclass of text/plain (vcal, etc.);
        // fall back to text/plain when it is absent or cannot be parsed.
        final String declaredType = metadata.get(Metadata.CONTENT_TYPE);
        final MediaType parsedType = (declaredType == null) ? null : MediaType.parse(declaredType);
        final MediaType baseType = (parsedType == null) ? MediaType.TEXT_PLAIN : parsedType;

        final Charset detected = reader.getCharset();
        metadata.set(Metadata.CONTENT_TYPE, new MediaType(baseType, detected).toString());
        // deprecated, see TIKA-431
        metadata.set(Metadata.CONTENT_ENCODING, detected.name());

        final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        xhtml.startElement("p");
        // Copy the decoded text through in fixed-size chunks until end of input.
        final char[] chunk = new char[4096];
        for (int count = reader.read(chunk); count != -1; count = reader.read(chunk)) {
            xhtml.characters(chunk, 0, count);
        }
        xhtml.endElement("p");
        xhtml.endDocument();
    }
}
Also used : AutoDetectReader(org.apache.tika.detect.AutoDetectReader) MediaType(org.apache.tika.mime.MediaType) Charset(java.nio.charset.Charset) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream)

Example 2 with AutoDetectReader

use of org.apache.tika.detect.AutoDetectReader in project tika by apache.

Class HtmlParser, method parse.

/**
 * Parses an HTML document with TagSoup, decoding {@code stream} with an
 * automatically detected character encoding.
 *
 * <p>When the incoming content type is absent or starts with one of the
 * recognised HTML-family prefixes, it is rewritten in {@code metadata} with the
 * detected charset attached; any other incoming content type is left untouched.
 * The detected charset name is always stored under
 * {@code Metadata.CONTENT_ENCODING}.
 *
 * @param stream   the raw input; wrapped in a {@code CloseShieldInputStream} before reading
 * @param handler  receiver of the resulting SAX events
 * @param metadata read for the incoming content type; updated with type and encoding
 * @param context  source of the encoding detector, the {@code HtmlMapper} and the {@code Schema}
 * @throws IOException   if reading from the stream fails
 * @throws SAXException  if parsing or event delivery fails
 * @throws TikaException declared by the parser contract
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(stream), metadata, getEncodingDetector(context))) {
        final Charset detected = reader.getCharset();
        final String declared = metadata.get(Metadata.CONTENT_TYPE);

        // Select the base media type first; the charset is attached once below.
        final MediaType base;
        if (declared == null || declared.startsWith("text/html")) {
            base = MediaType.TEXT_HTML;
        } else if (declared.startsWith("application/xhtml+xml")) {
            base = XHTML;
        } else if (declared.startsWith("application/vnd.wap.xhtml+xml")) {
            base = WAP_XHTML;
        } else if (declared.startsWith("application/x-asp")) {
            base = X_ASP;
        } else {
            // Unrecognised incoming type: keep whatever the metadata already holds.
            base = null;
        }
        if (base != null) {
            metadata.set(Metadata.CONTENT_TYPE, new MediaType(base, detected).toString());
        }
        // deprecated, see TIKA-431
        metadata.set(Metadata.CONTENT_ENCODING, detected.name());

        // Mapper and schema come from the parse context, with defaults as fallback.
        final HtmlMapper htmlMapper = context.get(HtmlMapper.class, new HtmlParserMapper());
        final org.ccil.cowan.tagsoup.Parser tagSoup = new org.ccil.cowan.tagsoup.Parser();
        final Schema htmlSchema = context.get(Schema.class, HTML_SCHEMA);
        // TIKA-528: Reuse shared schema to avoid heavy instantiation
        tagSoup.setProperty(org.ccil.cowan.tagsoup.Parser.schemaProperty, htmlSchema);
        // TIKA-599: Shared schema is thread-safe only if bogons are ignored
        tagSoup.setFeature(org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true);

        final HtmlHandler htmlHandler = new HtmlHandler(htmlMapper, handler, metadata);
        tagSoup.setContentHandler(new XHTMLDowngradeHandler(htmlHandler));
        tagSoup.parse(reader.asInputSource());
    }
}
Also used : HTMLSchema(org.ccil.cowan.tagsoup.HTMLSchema) Schema(org.ccil.cowan.tagsoup.Schema) Charset(java.nio.charset.Charset) AbstractEncodingDetectorParser(org.apache.tika.parser.AbstractEncodingDetectorParser) AutoDetectReader(org.apache.tika.detect.AutoDetectReader) MediaType(org.apache.tika.mime.MediaType) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream)

Example 3 with AutoDetectReader

use of org.apache.tika.detect.AutoDetectReader in project tika by apache.

Class SourceCodeParser, method parse.

/**
 * Reads a source-code file, records line count and author metadata, renders
 * the code to highlighted HTML and feeds that HTML through TagSoup to
 * {@code handler}.
 *
 * <p>Nothing is emitted when {@code metadata} lacks either a content type or a
 * resource name, since both are needed to select a renderer and to label the
 * highlighted output.
 *
 * @param stream   the raw input; wrapped in a {@code CloseShieldInputStream} before reading
 * @param handler  receiver of the SAX events produced from the highlighted HTML
 * @param metadata read for content type and resource name; updated with the
 *                 normalised content type, encoding, creators and {@code "LoC"}
 * @param context  source of the encoding detector and the TagSoup {@code Schema}
 * @throws IOException   if reading from the stream fails
 * @throws SAXException  if parsing the highlighted HTML fails
 * @throws TikaException if the content type in {@code metadata} cannot be parsed
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(stream), metadata, getEncodingDetector(context))) {
        Charset charset = reader.getCharset();
        String mediaType = metadata.get(Metadata.CONTENT_TYPE);
        String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
        if (mediaType == null || name == null) {
            return;
        }

        // MediaType.parse signals an unparseable value with null; fail with the
        // declared exception type instead of a NullPointerException below.
        MediaType type = MediaType.parse(mediaType);
        if (type == null) {
            throw new TikaException("Unable to parse content type: " + mediaType);
        }
        metadata.set(Metadata.CONTENT_TYPE, type.toString());
        metadata.set(Metadata.CONTENT_ENCODING, charset.name());

        // Looked up once rather than on every line.
        final String lineSeparator = System.lineSeparator();
        StringBuilder out = new StringBuilder();
        String line;
        int nbLines = 0;
        while ((line = reader.readLine()) != null) {
            out.append(line).append(lineSeparator);
            String author = parserAuthor(line);
            if (author != null) {
                metadata.add(TikaCoreProperties.CREATOR, author);
            }
            nbLines++;
        }
        metadata.set("LoC", String.valueOf(nbLines));

        Renderer renderer = getRenderer(type.toString());
        String codeAsHtml = renderer.highlight(name, out.toString(), charset.name(), false);

        // The highlighted markup is re-parsed so the handler receives SAX events.
        Schema schema = context.get(Schema.class, HTML_SCHEMA);
        org.ccil.cowan.tagsoup.Parser parser = new org.ccil.cowan.tagsoup.Parser();
        parser.setProperty(org.ccil.cowan.tagsoup.Parser.schemaProperty, schema);
        parser.setContentHandler(handler);
        parser.parse(new InputSource(new StringReader(codeAsHtml)));
    }
}
Also used : InputSource(org.xml.sax.InputSource) HTMLSchema(org.ccil.cowan.tagsoup.HTMLSchema) Schema(org.ccil.cowan.tagsoup.Schema) Charset(java.nio.charset.Charset) AbstractEncodingDetectorParser(org.apache.tika.parser.AbstractEncodingDetectorParser) AutoDetectReader(org.apache.tika.detect.AutoDetectReader) Renderer(com.uwyn.jhighlight.renderer.Renderer) StringReader(java.io.StringReader) MediaType(org.apache.tika.mime.MediaType) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream)

Example 4 with AutoDetectReader

use of org.apache.tika.detect.AutoDetectReader in project tika by apache.

Class EnviHeaderParser, method parse.

/**
 * Emits each line of an ENVI header file as its own XHTML {@code <p>} element,
 * decoding {@code stream} with an automatically detected character encoding.
 *
 * <p>The content type in {@code metadata} is set to {@code ENVI_MIME_TYPE}
 * before reading begins; the detected charset name is stored under
 * {@code Metadata.CONTENT_ENCODING}.
 *
 * @param stream   the raw input; wrapped in a {@code CloseShieldInputStream} before reading
 * @param handler  receiver of the generated XHTML SAX events
 * @param metadata updated with the content type and detected encoding
 * @param context  used to look up the encoding detector
 * @throws IOException   if reading from the stream fails
 * @throws SAXException  if the handler rejects an event
 * @throws TikaException declared by the parser contract
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Only outputting the MIME type as metadata
    metadata.set(Metadata.CONTENT_TYPE, ENVI_MIME_TYPE);
    // The following code was taken from the TXTParser
    // Automatically detect the character encoding
    try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(stream), metadata, getEncodingDetector(context))) {
        Charset charset = reader.getCharset();
        // deprecated, see TIKA-431
        metadata.set(Metadata.CONTENT_ENCODING, charset.name());
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        // text contents of the xhtml
        String line;
        while ((line = reader.readLine()) != null) {
            xhtml.startElement("p");
            xhtml.characters(line);
            xhtml.endElement("p");
        }
        xhtml.endDocument();
    }
}
Also used : TikaConfig(org.apache.tika.config.TikaConfig) AutoDetectReader(org.apache.tika.detect.AutoDetectReader) Charset(java.nio.charset.Charset) MediaType(org.apache.tika.mime.MediaType) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream)

Example 5 with AutoDetectReader

use of org.apache.tika.detect.AutoDetectReader in project tika by apache.

Class ISATabUtils, method parseStudy.

/**
 * Reads {@code stream} as delimited records (using {@code CSVFormat.TDF}) and
 * writes them to {@code xhtml} as a {@code <table>}.
 *
 * <p>The first record, if any, is written as {@code <th>} cells placed directly
 * inside {@code <thead>}; every following record becomes one {@code <tr>} of
 * {@code <td>} cells inside {@code <tbody>}. The character encoding is detected
 * with the encoding detector of the {@code TikaConfig} found in {@code context},
 * or of the default configuration when the context holds none.
 *
 * @param stream   the raw study file
 * @param xhtml    handler that receives the table markup
 * @param metadata passed to the encoding-detecting reader
 * @param context  optional source of a {@code TikaConfig}
 * @throws IOException   if reading from the stream fails
 * @throws TikaException if encoding detection fails
 * @throws SAXException  if the handler rejects an event
 */
public static void parseStudy(InputStream stream, XHTMLContentHandler xhtml, Metadata metadata, ParseContext context) throws IOException, TikaException, SAXException {
    final TikaInputStream input = TikaInputStream.get(stream);

    // Prefer the configuration supplied by the caller; otherwise use the default.
    TikaConfig config = context.get(TikaConfig.class);
    if (config == null) {
        config = TikaConfig.getDefaultConfig();
    }

    try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(input), metadata, config.getEncodingDetector());
        CSVParser records = new CSVParser(reader, CSVFormat.TDF)) {
        final Iterator<CSVRecord> rows = records.iterator();

        xhtml.startElement("table");

        // Header: only the first record, emitted cell by cell.
        xhtml.startElement("thead");
        if (rows.hasNext()) {
            final CSVRecord header = rows.next();
            final int columns = header.size();
            for (int col = 0; col < columns; col++) {
                xhtml.startElement("th");
                xhtml.characters(header.get(col));
                xhtml.endElement("th");
            }
        }
        xhtml.endElement("thead");

        // Body: every remaining record becomes one table row.
        xhtml.startElement("tbody");
        while (rows.hasNext()) {
            final CSVRecord row = rows.next();
            final int cells = row.size();
            xhtml.startElement("tr");
            for (int col = 0; col < cells; col++) {
                xhtml.startElement("td");
                xhtml.characters(row.get(col));
                xhtml.endElement("td");
            }
            xhtml.endElement("tr");
        }
        xhtml.endElement("tbody");

        xhtml.endElement("table");
    }
}
Also used : TikaConfig(org.apache.tika.config.TikaConfig) AutoDetectReader(org.apache.tika.detect.AutoDetectReader) CSVParser(org.apache.commons.csv.CSVParser) TikaInputStream(org.apache.tika.io.TikaInputStream) CSVRecord(org.apache.commons.csv.CSVRecord) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream)

Aggregations

CloseShieldInputStream (org.apache.commons.io.input.CloseShieldInputStream): 6; AutoDetectReader (org.apache.tika.detect.AutoDetectReader): 6; Charset (java.nio.charset.Charset): 4; MediaType (org.apache.tika.mime.MediaType): 4; TikaConfig (org.apache.tika.config.TikaConfig): 3; CSVParser (org.apache.commons.csv.CSVParser): 2; CSVRecord (org.apache.commons.csv.CSVRecord): 2; TikaInputStream (org.apache.tika.io.TikaInputStream): 2; AbstractEncodingDetectorParser (org.apache.tika.parser.AbstractEncodingDetectorParser): 2; XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler): 2; HTMLSchema (org.ccil.cowan.tagsoup.HTMLSchema): 2; Schema (org.ccil.cowan.tagsoup.Schema): 2; Renderer (com.uwyn.jhighlight.renderer.Renderer): 1; StringReader (java.io.StringReader): 1; InputSource (org.xml.sax.InputSource): 1