Search in sources :

Example 26 with Tika

use of org.apache.tika.Tika in project nutch by apache.

the class FileDumper method dump.

/**
 * Dumps the reverse-engineered raw content from the provided segment
 * directories. Pass a parent directory if it contains more than one segment;
 * otherwise a single segment can be passed as an argument.
 *
 * @param outputDir
 *          the directory you wish to dump the raw content to. This directory
 *          will be created.
 * @param segmentRootDir
 *          a directory containing one or more segments.
 * @param mimeTypes
 *          an array of mime types we have to dump, all others will be
 *          filtered out. If null, all mime types are accepted.
 * @param flatDir
 *          a boolean flag specifying whether the output directory should contain
 *          only files instead of using nested directories to prevent naming
 *          conflicts.
 * @param mimeTypeStats
 *          a flag indicating whether mimetype stats should be displayed
 *          instead of dumping files.
 * @param reverseURLDump
 *          a flag indicating whether each file should be written under a
 *          directory path derived from its reversed URL and named with the
 *          upper-case SHA-256 hex digest of the URL.
 * @throws Exception
 *           if a segment cannot be read or the filename-to-URL mapping file
 *           cannot be written.
 */
// Treats every readable sub-directory of segmentRootDir as a segment, detects the mime type of
// the content it reads with Tika, and writes content whose mime type is accepted to files under
// outputDir; per segment it also writes a JSON file mapping each output path to its URL.
// NOTE(review): this listing is visibly truncated (closing braces, some call targets and some
// statements are missing) — confirm every flagged spot against the upstream source.
public void dump(File outputDir, File segmentRootDir, String[] mimeTypes, boolean flatDir, boolean mimeTypeStats, boolean reverseURLDump) throws Exception {
    // NOTE(review): the call receiving this message is missing (presumably a log statement) — confirm.
    if (mimeTypes == null)"Accepting all mimetypes.");
    // total file counts
    Map<String, Integer> typeCounts = new HashMap<>();
    // filtered file counts
    Map<String, Integer> filteredCounts = new HashMap<>();
    Configuration conf = NutchConfiguration.create();
    int fileCount = 0;
    // File.listFiles returns null when the path is not a directory or cannot be listed.
    File[] segmentDirs = segmentRootDir.listFiles(file -> file.canRead() && file.isDirectory());
    if (segmentDirs == null) {
        LOG.error("No segment directories found in [" + segmentRootDir.getAbsolutePath() + "]");
    // NOTE(review): as shown, nothing stops the loop below from running with segmentDirs == null;
    // presumably an early exit was lost here — confirm.
    for (File segment : segmentDirs) {"Processing segment: [" + segment.getAbsolutePath() + "]");
        DataOutputStream doutputStream = null;
        // output path -> source URL, written out as JSON once the segment has been processed
        Map<String, String> filenameToUrl = new HashMap<String, String>();
        File segmentDir = new File(segment.getAbsolutePath(), Content.DIR_NAME);
        File[] partDirs = segmentDir.listFiles(file -> file.canRead() && file.isDirectory());
        if (partDirs == null) {
            LOG.warn("Skipping Corrupt Segment: [{}]", segment.getAbsolutePath());
        for (File partDir : partDirs) {
            try (FileSystem fs = FileSystem.get(conf)) {
                String segmentPath = partDir + "/data";
                Path file = new Path(segmentPath);
                if (!new File(file.toString()).exists()) {
                    LOG.warn("Skipping segment: [" + segmentPath + "]: no data directory present");
                SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(file));
                Writable key = (Writable) reader.getKeyClass().newInstance();
                Content content = null;
                // NOTE(review): the loop condition is missing; presumably it advances `reader` — confirm.
                while ( {
                    content = new Content();
                    String url = key.toString();
                    String baseName = FilenameUtils.getBaseName(url);
                    String extension = FilenameUtils.getExtension(url);
                    // URLs without a file extension are dumped with an "html" extension.
                    if (extension == null || (extension != null && extension.equals(""))) {
                        extension = "html";
                    ByteArrayInputStream bas = null;
                    // Becomes true when the detected mime type is accepted; gates the dump step below.
                    Boolean filter = false;
                    try {
                        bas = new ByteArrayInputStream(content.getContent());
                        String mimeType = new Tika().detect(content.getContent());
                        collectStats(typeCounts, mimeType);
                        if (mimeType != null) {
                            // A null mimeTypes array accepts every detected type.
                            if (mimeTypes == null || Arrays.asList(mimeTypes).contains(mimeType)) {
                                collectStats(filteredCounts, mimeType);
                                filter = true;
                    } catch (Exception e) {
                        LOG.warn("Tika is unable to detect type for: [" + url + "]");
                    } finally {
                        if (bas != null) {
                            // NOTE(review): the try body is empty in this listing; presumably it closed `bas` — confirm.
                            try {
                            } catch (Exception ignore) {
                    if (filter) {
                        // In stats-only mode nothing is written to disk.
                        if (!mimeTypeStats) {
                            String md5Ofurl = DumpFileUtil.getUrlMD5(url);
                            String fullDir = outputDir.getAbsolutePath();
                            // Two-level nested directories are used only when neither flat nor reverse-URL layout is requested.
                            if (!flatDir && !reverseURLDump) {
                                fullDir = DumpFileUtil.createTwoLevelsDirectory(fullDir, md5Ofurl);
                            if (!Strings.isNullOrEmpty(fullDir)) {
                                String outputFullPath;
                                if (reverseURLDump) {
                                    // Path = part of the reversed URL before the first ':', with dots turned into
                                    // '/', followed by the upper-cased SHA-256 hex digest of the URL as file name.
                                    String[] reversedURL = TableUtil.reverseUrl(url).split(":");
                                    reversedURL[0] = reversedURL[0].replace('.', '/');
                                    String reversedURLPath = reversedURL[0] + "/" + DigestUtils.sha256Hex(url).toUpperCase();
                                    outputFullPath = String.format("%s/%s", fullDir, reversedURLPath);
                                    // We'll drop the trailing file name and create the nested structure if it doesn't already exist.
                                    String[] splitPath = outputFullPath.split("/");
                                    File fullOutputDir = new File(org.apache.commons.lang3.StringUtils.join(Arrays.copyOf(splitPath, splitPath.length - 1), "/"));
                                    // NOTE(review): the body of this check is missing; presumably it creates the directories — confirm.
                                    if (!fullOutputDir.exists()) {
                                } else {
                                    outputFullPath = String.format("%s/%s", fullDir, DumpFileUtil.createFileName(md5Ofurl, baseName, extension));
                                filenameToUrl.put(outputFullPath, url);
                                File outputFile = new File(outputFullPath);
                                // Existing files are never overwritten.
                                if (!outputFile.exists()) {
                          "Writing: [" + outputFullPath + "]");
                                    // Modified to prevent FileNotFoundException (Invalid Argument)
                                    FileOutputStream output = null;
                                    try {
                                        output = new FileOutputStream(outputFile);
                                        IOUtils.write(content.getContent(), output);
                                    } catch (Exception e) {
                                        // A failed write is logged and the dump moves on.
                                        LOG.warn("Write Error: [" + outputFullPath + "]");
                                    } finally {
                                        if (output != null) {
                                            // NOTE(review): empty try body in this listing; presumably it closed `output` — confirm.
                                            try {
                                            } catch (Exception ignore) {
                                } else {
                          "Skipping writing: [" + outputFullPath + "]: file already exists");
            } finally {
                // doutputStream is never assigned in the visible code, so this guard is always false as shown.
                if (doutputStream != null) {
                    try {
                    } catch (Exception ignore) {
        // save filenameToUrl in a json file for each segment there is one mapping file
        String filenameToUrlFilePath = String.format("%s/%s_filenameToUrl.json", outputDir.getAbsolutePath(), segment.getName());
        new ObjectMapper().writeValue(new File(filenameToUrlFilePath), filenameToUrl);
    }"Dumper File Stats: " + DumpFileUtil.displayFileTypes(typeCounts, filteredCounts));
    // With mimeTypeStats set, the statistics are also printed to standard output.
    if (mimeTypeStats) {
        System.out.println("Dumper File Stats: " + DumpFileUtil.displayFileTypes(typeCounts, filteredCounts));
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) NutchConfiguration(org.apache.nutch.util.NutchConfiguration) HashMap(java.util.HashMap) DataOutputStream( Writable( Tika(org.apache.tika.Tika) SequenceFile( ByteArrayInputStream( Content(org.apache.nutch.protocol.Content) FileSystem(org.apache.hadoop.fs.FileSystem) FileOutputStream( SequenceFile( File( ObjectMapper(

Example 27 with Tika

use of org.apache.tika.Tika in project nutch by apache.

the class ZipTextExtractor method extractText.

public String extractText(InputStream input, String url, List<Outlink> outLinksList) throws IOException {
    String resultText = "";
    ZipInputStream zin = new ZipInputStream(input);
    ZipEntry entry;
    while ((entry = zin.getNextEntry()) != null) {
        if (!entry.isDirectory()) {
            int size = (int) entry.getSize();
            byte[] b = new byte[size];
            for (int x = 0; x < size; x++) {
                int err =;
                if (err != -1) {
                    b[x] = (byte) err;
            String newurl = url + "/";
            String fname = entry.getName();
            newurl += fname;
            URL aURL = new URL(newurl);
            String base = aURL.toString();
            int i = fname.lastIndexOf('.');
            if (i != -1) {
                // Trying to resolve the Mime-Type
                Tika tika = new Tika();
                String contentType = tika.detect(fname);
                try {
                    Metadata metadata = new Metadata();
                    metadata.set(Response.CONTENT_LENGTH, Long.toString(entry.getSize()));
                    metadata.set(Response.CONTENT_TYPE, contentType);
                    Content content = new Content(newurl, base, b, contentType, metadata, this.conf);
                    Parse parse = new ParseUtil(this.conf).parse(content).get(content.getUrl());
                    ParseData theParseData = parse.getData();
                    Outlink[] theOutlinks = theParseData.getOutlinks();
                    for (int count = 0; count < theOutlinks.length; count++) {
                        outLinksList.add(new Outlink(theOutlinks[count].getToUrl(), theOutlinks[count].getAnchor()));
                    resultText += entry.getName() + " " + parse.getText() + " ";
                } catch (ParseException e) {
                    if (LOG.isInfoEnabled()) {
              "fetch okay, but can't parse " + fname + ", reason: " + e.getMessage());
    return resultText;
Also used : Outlink(org.apache.nutch.parse.Outlink) ParseUtil(org.apache.nutch.parse.ParseUtil) Parse(org.apache.nutch.parse.Parse) ZipEntry(java.util.zip.ZipEntry) Metadata(org.apache.nutch.metadata.Metadata) Tika(org.apache.tika.Tika) URL(java.net.URL) ZipInputStream(java.util.zip.ZipInputStream) ParseData(org.apache.nutch.parse.ParseData) Content(org.apache.nutch.protocol.Content) ParseException(org.apache.nutch.parse.ParseException)

Example 28 with Tika

use of org.apache.tika.Tika in project gitblit by gitblit.

the class RawServlet method processRequest.

	/**
	 * Retrieves the specified resource from the specified branch of the
	 * repository.
	 *
	 * @param request
	 * @param response
	 * @throws javax.servlet.ServletException
	 * @throws java.io.IOException
	 */
// Resolves the request path to a repository, branch and path inside that branch, then serves
// either the addressed file (as text or as a binary stream), an index file, or a directory listing.
// NOTE(review): this listing is visibly truncated (closing braces and some statements are
// missing) — confirm every flagged spot against the upstream source.
private void processRequest(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
    String path = request.getPathInfo();
    if (path.toLowerCase().endsWith(".git")) {
        // forward to url with trailing /
        // this is important for relative pages links
        response.sendRedirect(request.getServletPath() + path + "/");
    if (path.charAt(0) == '/') {
        // strip leading /
        path = path.substring(1);
    // determine repository and resource from url
    String repository = path;
    Repository r = null;
    int terminator = repository.length();
    // Try the whole path as repository name first, then drop one trailing path segment at a
    // time until a repository is found or no '/' is left.
    do {
        repository = repository.substring(0, terminator);
        r = repositoryManager.getRepository(repository, false);
        terminator = repository.lastIndexOf('/');
    } while (r == null && terminator > -1);
    ServletContext context = request.getSession().getServletContext();
    try {
        if (r == null) {
            // repository not found!
            String mkd = MessageFormat.format("# Error\nSorry, no valid **repository** specified in this url: {0}!", path);
            error(response, mkd);
        // identify the branch
        String branch = getBranch(repository, request);
        if (StringUtils.isEmpty(branch)) {
            branch = r.getBranch();
            if (branch == null) {
                // no branches found!  empty?
                String mkd = MessageFormat.format("# Error\nSorry, no valid **branch** specified in this url: {0}!", path);
                error(response, mkd);
            } else {
                // redirect to default branch
                // NOTE(review): `url` is built but never used in this listing; presumably a redirect followed — confirm.
                String base = request.getRequestURI();
                String url = base + branch + "/";
        // identify the requested path
        String requestedPath = getPath(repository, branch, request);
        // identify the commit
        RevCommit commit = JGitUtils.getCommit(r, branch);
        if (commit == null) {
            // branch not found!
            String mkd = MessageFormat.format("# Error\nSorry, the repository {0} does not have a **{1}** branch!", repository, branch);
            error(response, mkd);
        // Extension -> content type shortcuts consulted before the settings and Tika.
        Map<String, String> quickContentTypes = new HashMap<>();
        quickContentTypes.put("html", "text/html");
        quickContentTypes.put("htm", "text/html");
        quickContentTypes.put("xml", "application/xml");
        quickContentTypes.put("json", "application/json");
        List<PathModel> pathEntries = JGitUtils.getFilesInPath(r, requestedPath, commit);
        if (pathEntries.isEmpty()) {
            // requested a specific resource
            String file = StringUtils.getLastPathElement(requestedPath);
            try {
                String ext = StringUtils.getFileExtension(file).toLowerCase();
                // We can't parse out an extension for classic "dotfiles", so make a general assumption that
                // they're text files to allow presenting them in browser instead of only for download.
                // However, that only holds for files with no other extension included, for files that happen
                // to start with a dot but also include an extension, process the extension normally.
                // This logic covers .gitattributes, .gitignore, .zshrc, etc., but does not cover .mongorc.js, .zshrc.bak
                boolean isExtensionlessDotfile = file.charAt(0) == '.' && (file.length() == 1 || file.indexOf('.', 1) < 0);
                String contentType = isExtensionlessDotfile ? "text/plain" : quickContentTypes.get(ext);
                if (contentType == null) {
                    List<String> exts = runtimeManager.getSettings().getStrings(Keys.web.prettyPrintExtensions);
                    if (exts.contains(ext)) {
                        // extension is a registered text type for pretty printing
                        contentType = "text/plain";
                    } else {
                        // query Tika for the content type
                        Tika tika = new Tika();
                        contentType = tika.detect(file);
                if (contentType == null) {
                    // ask the container for the content type
                    contentType = context.getMimeType(requestedPath);
                    if (contentType == null) {
                        // still unknown content type, assume binary
                        contentType = "application/octet-stream";
                if (isTextType(contentType) || isTextDataType(contentType)) {
                    // load, interpret, and serve text content as UTF-8
                    String[] encodings = runtimeManager.getSettings().getStrings(Keys.web.blobEncodings).toArray(new String[0]);
                    String content = JGitUtils.getStringContent(r, commit.getTree(), requestedPath, encodings);
                    if (content == null) {
                        logger.error("RawServlet Failed to load {} {} {}", repository, commit.getName(), path);
                        notFound(response, requestedPath, branch);
                    // The text is re-encoded with Constants.ENCODING before being sent.
                    byte[] bytes = content.getBytes(Constants.ENCODING);
                    setContentType(response, contentType);
                    ByteArrayInputStream is = new ByteArrayInputStream(bytes);
                    sendContent(response, JGitUtils.getCommitDate(commit), is);
                } else {
                    // stream binary content directly from the repository
                    if (!streamFromRepo(request, response, r, commit, requestedPath)) {
                        logger.error("RawServlet Failed to load {} {} {}", repository, commit.getName(), path);
                        notFound(response, requestedPath, branch);
            } catch (Exception e) {
                logger.error(null, e);
        } else {
            // path request
            if (!request.getPathInfo().endsWith("/")) {
                // redirect to trailing '/' url
                response.sendRedirect(request.getServletPath() + request.getPathInfo() + "/");
            if (renderIndex()) {
                // locate and render an index file
                // NOTE(review): the body of the loop over pathEntries and the statements filling
                // `extensions` are missing, so `names` and `extensions` stay empty as shown — confirm.
                Map<String, String> names = new TreeMap<String, String>();
                for (PathModel entry : pathEntries) {
                List<String> extensions = new ArrayList<String>();
                String content = null;
                for (String ext : extensions) {
                    String key = "index." + ext;
                    if (names.containsKey(key)) {
                        String fileName = names.get(key);
                        String fullPath = fileName;
                        if (!requestedPath.isEmpty()) {
                            fullPath = requestedPath + "/" + fileName;
                        String[] encodings = runtimeManager.getSettings().getStrings(Keys.web.blobEncodings).toArray(new String[0]);
                        String stringContent = JGitUtils.getStringContent(r, commit.getTree(), fullPath, encodings);
                        // NOTE(review): the body of this null check is missing — confirm.
                        if (stringContent == null) {
                        content = stringContent;
                        requestedPath = fullPath;
                response.setContentType("text/html; charset=" + Constants.ENCODING);
                byte[] bytes = content.getBytes(Constants.ENCODING);
                ByteArrayInputStream is = new ByteArrayInputStream(bytes);
                sendContent(response, JGitUtils.getCommitDate(commit), is);
        // no content, document list or 404 page
        if (pathEntries.isEmpty()) {
            // default 404 page
            notFound(response, requestedPath, branch);
        } else {
            // directory list
            response.getWriter().append("<style>table th, table td { min-width: 150px; text-align: left; }</style>");
            // Row template: {0} base path, {1} encoded entry, {2} permissions, {3} size.
            String pattern = "<tr><td><a href=\"{0}/{1}\">{1}</a></td><td>{2}</td><td>{3}</td></tr>";
            final ByteFormat byteFormat = new ByteFormat();
            if (!pathEntries.isEmpty()) {
                if (pathEntries.get(0).path.indexOf('/') > -1) {
                    // we are in a subdirectory, add parent directory link
                    String pp = URLEncoder.encode(requestedPath, Constants.ENCODING);
                    pathEntries.add(0, new PathModel("..", pp + "/..", null, 0, FileMode.TREE.getBits(), null, null));
            String basePath = request.getServletPath() + request.getPathInfo();
            if (basePath.charAt(basePath.length() - 1) == '/') {
                // strip trailing slash
                basePath = basePath.substring(0, basePath.length() - 1);
            for (PathModel entry : pathEntries) {
                // NOTE(review): the first argument of encode(...) is missing in this listing — confirm.
                String pp = URLEncoder.encode(, Constants.ENCODING);
                // The size column is filled for files only.
                response.getWriter().append(MessageFormat.format(pattern, basePath, pp, JGitUtils.getPermissionsFromMode(entry.mode), entry.isFile() ? byteFormat.format(entry.size) : ""));
    } catch (Throwable t) {
        logger.error("Failed to write page to client", t);
    // NOTE(review): the body of the finally block is missing in this listing — confirm.
    } finally {
Also used : HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) Tika(org.apache.tika.Tika) TreeMap(java.util.TreeMap) ServletException(javax.servlet.ServletException) ParseException(java.text.ParseException) IOException(java.io.IOException) UnsupportedEncodingException(java.io.UnsupportedEncodingException) Repository(org.eclipse.jgit.lib.Repository) PathModel(com.gitblit.models.PathModel) ByteArrayInputStream(java.io.ByteArrayInputStream) ByteFormat(com.gitblit.utils.ByteFormat) ServletContext(javax.servlet.ServletContext) RevCommit(org.eclipse.jgit.revwalk.RevCommit)

Example 29 with Tika

use of org.apache.tika.Tika in project lucene-solr by apache.

the class MailEntityProcessor method addPartToDocument.

// Adds a mail part to the row: multipart content is walked recursively, an embedded
// message/rfc822 part is unwrapped, and any other part is parsed to text with Tika and stored
// under CONTENT or, for attachments, under ATTACHMENT / ATTACHMENT_NAMES.
// NOTE(review): this listing is visibly truncated (closing braces and some statements are
// missing) — confirm every flagged spot against the upstream source.
public void addPartToDocument(Part part, Map<String, Object> row, boolean outerMost) throws Exception {
    if (part instanceof Message) {
        addEnvelopeToDocument(part, row);
    String ct = part.getContentType().toLowerCase(Locale.ROOT);
    ContentType ctype = new ContentType(ct);
    if (part.isMimeType("multipart/*")) {
        Object content = part.getContent();
        if (content != null && content instanceof Multipart) {
            Multipart mp = (Multipart) part.getContent();
            int count = mp.getCount();
            // For multipart/alternative only the first body part is processed.
            if (part.isMimeType("multipart/alternative"))
                count = 1;
            for (int i = 0; i < count; i++) addPartToDocument(mp.getBodyPart(i), row, false);
        } else {
            LOG.warn("Multipart content is a not an instance of Multipart! Content is: " + (content != null ? content.getClass().getName() : "null") + ". Typically, this is due to the Java Activation JAR being loaded by the wrong classloader.");
    } else if (part.isMimeType("message/rfc822")) {
        // An embedded message is processed like any other part.
        addPartToDocument((Part) part.getContent(), row, false);
    } else {
        String disp = part.getDisposition();
        // Body text: parsed only when includeContent is set and the part is not an attachment.
        if (includeContent && !(disp != null && disp.equalsIgnoreCase(Part.ATTACHMENT))) {
            InputStream is = part.getInputStream();
            // The part's declared base content type is passed to Tika as a hint.
            Metadata contentTypeHint = new Metadata();
            contentTypeHint.set(Metadata.CONTENT_TYPE, ctype.getBaseType().toLowerCase(Locale.ENGLISH));
            String content = (new Tika()).parseToString(is, contentTypeHint);
            if (row.get(CONTENT) == null)
                row.put(CONTENT, new ArrayList<String>());
            // NOTE(review): `content` is never added to `contents` in this listing; presumably
            // that statement was lost — confirm.
            List<String> contents = (List<String>) row.get(CONTENT);
            row.put(CONTENT, contents);
        // NOTE(review): this `if` has no body in the listing; presumably an early return for
        // non-attachments (or when attachment processing is off) was lost — confirm.
        if (!processAttachment || disp == null || !disp.equalsIgnoreCase(Part.ATTACHMENT))
        InputStream is = part.getInputStream();
        String fileName = part.getFileName();
        Metadata contentTypeHint = new Metadata();
        contentTypeHint.set(Metadata.CONTENT_TYPE, ctype.getBaseType().toLowerCase(Locale.ENGLISH));
        String content = (new Tika()).parseToString(is, contentTypeHint);
        // NOTE(review): this `if` has no body in the listing; presumably empty attachment text
        // caused an early return — confirm.
        if (content == null || content.trim().length() == 0)
        if (row.get(ATTACHMENT) == null)
            row.put(ATTACHMENT, new ArrayList<String>());
        // NOTE(review): neither `content` nor `fileName` is added to the lists below in this
        // listing; presumably those statements were lost — confirm.
        List<String> contents = (List<String>) row.get(ATTACHMENT);
        row.put(ATTACHMENT, contents);
        if (row.get(ATTACHMENT_NAMES) == null)
            row.put(ATTACHMENT_NAMES, new ArrayList<String>());
        List<String> names = (List<String>) row.get(ATTACHMENT_NAMES);
        row.put(ATTACHMENT_NAMES, names);
Also used : MimeMessage(javax.mail.internet.MimeMessage) IMAPMessage(com.sun.mail.imap.IMAPMessage) ContentType(javax.mail.internet.ContentType) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) Tika(org.apache.tika.Tika)

Example 30 with Tika

use of org.apache.tika.Tika in project ddf by codice.

the class URLResourceReader method getMimeType.

private String getMimeType(URI resourceURI, String productName) throws MimeTypeResolutionException, IOException {
    // Determine the mime type in a hierarchical fashion. The hierarchy is based on the
    // most accurate mime type resolution being used and lesser accurate approaches being
    // used
    // if a mime type is not resolved.
    // The approaches, in order, are:
    // 1. Try using the DDF MimeTypeMapper so that custom MimeTypeResolvers are used
    // 2. Try using Apache Tika directly on the URL
    String mimeType = null;
    if (mimeTypeMapper == null) {
        LOGGER.debug("mimeTypeMapper is NULL");
    } else {
        // Extract the file extension (if any) from the URL's filename
        String fileExtension = FilenameUtils.getExtension(productName);
        mimeType = mimeTypeMapper.getMimeTypeForFileExtension(fileExtension);
    // mime type resolution than just file extension mime type mapping
    if ((mimeType == null || mimeType.isEmpty() || mimeType.equals(DEFAULT_MIME_TYPE)) && URL_FILE_SCHEME.equalsIgnoreCase(resourceURI.getScheme())) {
        // Use Apache Tika to detect mime type from URL
        Tika tika = new Tika();
        mimeType = tika.detect(resourceURI.toURL());
        LOGGER.debug("Tika determined mimeType for url = {}", mimeType);
    } else {
        LOGGER.debug("mimeType = {} set by MimeTypeMapper", mimeType);
    // never be returned.
    if (mimeType == null || mimeType.equals("content/unknown")) {
        mimeType = "application/unknown";
    LOGGER.debug("mimeType set to: {}", mimeType);
    return mimeType;
Also used : Tika(org.apache.tika.Tika)


Tika (org.apache.tika.Tika)54 Test (org.junit.Test)32 Metadata (org.apache.tika.metadata.Metadata)29 ByteArrayInputStream ( TikaTest (org.apache.tika.TikaTest)12 TikaConfig (org.apache.tika.config.TikaConfig)12 File ( InputStream ( URL ( TikaInputStream ( IOException ( HashSet (java.util.HashSet)4 Ignore (org.junit.Ignore)4 FileInputStream ( ArrayList (java.util.ArrayList)3 HashMap (java.util.HashMap)3 Content (org.apache.nutch.protocol.Content)3 Before (org.junit.Before)3 FileOutputStream ( UnsupportedEncodingException (