use of org.apache.tika.Tika in project nutch by apache.
the class FileDumper method dump.
/**
 * Dumps the reverse engineered raw content from the provided segment
 * directories if a parent directory contains more than one segment, otherwise
 * a single segment can be passed as an argument.
 * <p>
 * For every segment a <code>&lt;segmentName&gt;_filenameToUrl.json</code> file
 * mapping each output path to its source URL is written into
 * <code>outputDir</code>. This mapping file is written even when
 * <code>mimeTypeStats</code> is set (it is then empty).
 *
 * @param outputDir
 *          the directory you wish to dump the raw content to. This directory
 *          will be created.
 * @param segmentRootDir
 *          a directory containing one or more segments.
 * @param mimeTypes
 *          an array of mime types we have to dump, all others will be
 *          filtered out. <code>null</code> accepts every mime type.
 * @param flatDir
 *          a boolean flag specifying whether the output directory should contain
 *          only files instead of using nested directories to prevent naming
 *          conflicts. Ignored when <code>reverseURLDump</code> is set.
 * @param mimeTypeStats
 *          a flag indicating whether mimetype stats should be displayed
 *          instead of dumping files.
 * @param reverseURLDump
 *          a flag indicating that each record should be written below a
 *          nested directory derived from <code>TableUtil.reverseUrl(url)</code>
 *          (dots replaced by slashes), using the upper-cased SHA-256 hex digest
 *          of the URL as file name.
 * @throws Exception
 *           if a segment cannot be read or the mapping file cannot be written.
 */
public void dump(File outputDir, File segmentRootDir, String[] mimeTypes, boolean flatDir, boolean mimeTypeStats, boolean reverseURLDump) throws Exception {
    if (mimeTypes == null) {
        LOG.info("Accepting all mimetypes.");
    }
    // total file counts
    Map<String, Integer> typeCounts = new HashMap<>();
    // filtered file counts
    Map<String, Integer> filteredCounts = new HashMap<>();
    Configuration conf = NutchConfiguration.create();
    // A single detector is reused for every record instead of building one per record.
    Tika tika = new Tika();

    File[] segmentDirs = segmentRootDir.listFiles(file -> file.canRead() && file.isDirectory());
    if (segmentDirs == null) {
        LOG.error("No segment directories found in [" + segmentRootDir.getAbsolutePath() + "]");
        return;
    }

    for (File segment : segmentDirs) {
        LOG.info("Processing segment: [" + segment.getAbsolutePath() + "]");
        Map<String, String> filenameToUrl = new HashMap<String, String>();
        File segmentDir = new File(segment.getAbsolutePath(), Content.DIR_NAME);
        File[] partDirs = segmentDir.listFiles(file -> file.canRead() && file.isDirectory());
        if (partDirs == null) {
            LOG.warn("Skipping Corrupt Segment: [{}]", segment.getAbsolutePath());
            continue;
        }

        for (File partDir : partDirs) {
            // NOTE(review): fs is never used directly; it is kept (and closed) only to
            // preserve the original behaviour -- confirm whether it can be dropped.
            try (FileSystem fs = FileSystem.get(conf)) {
                String segmentPath = partDir + "/data";
                Path file = new Path(segmentPath);
                if (!new File(file.toString()).exists()) {
                    LOG.warn("Skipping segment: [" + segmentPath + "]: no data directory present");
                    continue;
                }

                // try-with-resources guarantees the reader is released even when a record fails.
                try (SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(file))) {
                    Writable key = (Writable) reader.getKeyClass().newInstance();
                    while (reader.next(key)) {
                        Content content = new Content();
                        reader.getCurrentValue(content);
                        String url = key.toString();

                        // Detect the mime type and decide whether this record passes the filter.
                        boolean accepted = false;
                        try {
                            String mimeType = tika.detect(content.getContent());
                            collectStats(typeCounts, mimeType);
                            if (mimeType != null && (mimeTypes == null || Arrays.asList(mimeTypes).contains(mimeType))) {
                                collectStats(filteredCounts, mimeType);
                                accepted = true;
                            }
                        } catch (Exception e) {
                            LOG.warn("Tika is unable to detect type for: [" + url + "]", e);
                        }
                        if (!accepted || mimeTypeStats) {
                            // Either filtered out, or only statistics were requested.
                            continue;
                        }

                        String md5Ofurl = DumpFileUtil.getUrlMD5(url);
                        String fullDir = outputDir.getAbsolutePath();
                        if (!flatDir && !reverseURLDump) {
                            fullDir = DumpFileUtil.createTwoLevelsDirectory(fullDir, md5Ofurl);
                        }
                        if (Strings.isNullOrEmpty(fullDir)) {
                            continue;
                        }

                        String outputFullPath;
                        if (reverseURLDump) {
                            String[] reversedURL = TableUtil.reverseUrl(url).split(":");
                            reversedURL[0] = reversedURL[0].replace('.', '/');
                            String reversedURLPath = reversedURL[0] + "/" + DigestUtils.sha256Hex(url).toUpperCase();
                            outputFullPath = String.format("%s/%s", fullDir, reversedURLPath);
                            // We'll drop the trailing file name and create the nested structure if it doesn't already exist.
                            String[] splitPath = outputFullPath.split("/");
                            File fullOutputDir = new File(org.apache.commons.lang3.StringUtils.join(Arrays.copyOf(splitPath, splitPath.length - 1), "/"));
                            if (!fullOutputDir.exists()) {
                                fullOutputDir.mkdirs();
                            }
                        } else {
                            String baseName = FilenameUtils.getBaseName(url);
                            String extension = FilenameUtils.getExtension(url);
                            if (extension == null || extension.isEmpty()) {
                                // URLs without an extension are written as html.
                                extension = "html";
                            }
                            outputFullPath = String.format("%s/%s", fullDir, DumpFileUtil.createFileName(md5Ofurl, baseName, extension));
                        }
                        filenameToUrl.put(outputFullPath, url);

                        File outputFile = new File(outputFullPath);
                        if (outputFile.exists()) {
                            LOG.info("Skipping writing: [" + outputFullPath + "]: file already exists");
                            continue;
                        }
                        LOG.info("Writing: [" + outputFullPath + "]");
                        // A failed write is logged and the dump carries on with the next record.
                        try (FileOutputStream output = new FileOutputStream(outputFile)) {
                            IOUtils.write(content.getContent(), output);
                        } catch (Exception e) {
                            LOG.warn("Write Error: [" + outputFullPath + "]", e);
                        }
                    }
                }
            }
        }

        // save filenameToUrl in a json file for each segment there is one mapping file
        String filenameToUrlFilePath = String.format("%s/%s_filenameToUrl.json", outputDir.getAbsolutePath(), segment.getName());
        new ObjectMapper().writeValue(new File(filenameToUrlFilePath), filenameToUrl);
    }

    LOG.info("Dumper File Stats: " + DumpFileUtil.displayFileTypes(typeCounts, filteredCounts));
    if (mimeTypeStats) {
        System.out.println("Dumper File Stats: " + DumpFileUtil.displayFileTypes(typeCounts, filteredCounts));
    }
}
use of org.apache.tika.Tika in project nutch by apache.
the class ZipTextExtractor method extractText.
/**
 * Extracts the text of the files contained in a zip archive.
 * <p>
 * Every non-directory entry whose name contains a '.' is wrapped in a
 * {@link Content} whose type is detected from the entry name, parsed with
 * {@link ParseUtil}, and its text appended to the result. Outlinks found while
 * parsing are added to <code>outLinksList</code>. Entries that fail with a
 * {@link ParseException} are logged and skipped.
 *
 * @param input
 *          stream positioned at the start of the zip archive. It is not closed
 *          by this method.
 * @param url
 *          URL of the archive; each entry is addressed as
 *          <code>url + "/" + entryName</code>.
 * @param outLinksList
 *          list that receives the outlinks of all parsed entries.
 * @return the concatenation of <code>entryName + " " + text + " "</code> for
 *         every parsed entry, or an empty string when nothing was parsed.
 * @throws IOException
 *           if the archive cannot be read or an entry URL is malformed.
 */
public String extractText(InputStream input, String url, List<Outlink> outLinksList) throws IOException {
    StringBuilder resultText = new StringBuilder();
    // Not closed on purpose: closing would also close the caller's stream.
    ZipInputStream zin = new ZipInputStream(input);
    // One detector for the whole archive instead of one per entry.
    Tika tika = new Tika();
    byte[] chunk = new byte[8192];
    ZipEntry entry;
    while ((entry = zin.getNextEntry()) != null) {
        if (entry.isDirectory()) {
            continue;
        }
        String fname = entry.getName();
        String newurl = url + "/" + fname;
        String base = new URL(newurl).toString();
        if (fname.lastIndexOf('.') == -1) {
            // No extension: the entry is not parsed; getNextEntry() skips its data.
            continue;
        }

        // Read until end-of-entry rather than trusting entry.getSize(): the size
        // may be -1 (unknown) for streamed archives, which previously produced a
        // NegativeArraySizeException, and an int cast truncates large sizes.
        // NOTE(review): the uncompressed size is not capped; consider a limit if
        // archives come from untrusted sources.
        java.io.ByteArrayOutputStream buffer = new java.io.ByteArrayOutputStream();
        int read;
        while ((read = zin.read(chunk)) != -1) {
            buffer.write(chunk, 0, read);
        }
        byte[] b = buffer.toByteArray();

        // Trying to resolve the Mime-Type
        String contentType = tika.detect(fname);
        try {
            Metadata metadata = new Metadata();
            metadata.set(Response.CONTENT_LENGTH, Integer.toString(b.length));
            metadata.set(Response.CONTENT_TYPE, contentType);
            Content content = new Content(newurl, base, b, contentType, metadata, this.conf);
            Parse parse = new ParseUtil(this.conf).parse(content).get(content.getUrl());
            ParseData theParseData = parse.getData();
            for (Outlink link : theParseData.getOutlinks()) {
                outLinksList.add(new Outlink(link.getToUrl(), link.getAnchor()));
            }
            resultText.append(fname).append(' ').append(parse.getText()).append(' ');
        } catch (ParseException e) {
            if (LOG.isInfoEnabled()) {
                LOG.info("fetch okay, but can't parse " + fname + ", reason: " + e.getMessage());
            }
        }
    }
    return resultText.toString();
}
use of org.apache.tika.Tika in project gitblit by gitblit.
the class RawServlet method processRequest.
/**
 * Retrieves the specified resource from the specified branch of the
 * repository.
 * <p>
 * The request path is resolved to a repository, a branch and a path inside
 * that branch. A file is served either as UTF-8 text or streamed as binary,
 * depending on its detected content type. A directory is rendered as its
 * index file (when {@code renderIndex()} is true and an index exists) or as
 * an HTML directory listing. Unresolvable requests produce an error or
 * not-found page.
 *
 * @param request
 *            the incoming servlet request
 * @param response
 *            the response the resource, listing or error page is written to
 * @throws javax.servlet.ServletException
 * @throws java.io.IOException
 */
private void processRequest(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
    // getPathInfo() is null when the request carries no extra path information.
    String rawPathInfo = request.getPathInfo();
    String pathInfo = rawPathInfo == null ? "" : rawPathInfo;
    String path = pathInfo;
    if (path.toLowerCase().endsWith(".git")) {
        // forward to url with trailing /
        // this is important for relative pages links
        response.sendRedirect(request.getServletPath() + path + "/");
        return;
    }
    if (path.startsWith("/")) {
        // strip leading /
        path = path.substring(1);
    }

    // determine repository and resource from url:
    // try ever shorter prefixes of the path until one names a repository
    String repository = path;
    Repository r = null;
    int terminator = repository.length();
    do {
        repository = repository.substring(0, terminator);
        r = repositoryManager.getRepository(repository, false);
        terminator = repository.lastIndexOf('/');
    } while (r == null && terminator > -1);

    ServletContext context = request.getSession().getServletContext();
    try {
        if (r == null) {
            // repository not found!
            String mkd = MessageFormat.format("# Error\nSorry, no valid **repository** specified in this url: {0}!", path);
            error(response, mkd);
            return;
        }

        // identify the branch
        String branch = getBranch(repository, request);
        if (StringUtils.isEmpty(branch)) {
            branch = r.getBranch();
            if (branch == null) {
                // no branches found! empty?
                String mkd = MessageFormat.format("# Error\nSorry, no valid **branch** specified in this url: {0}!", path);
                error(response, mkd);
            } else {
                // redirect to default branch
                String base = request.getRequestURI();
                String url = base + branch + "/";
                response.sendRedirect(url);
            }
            return;
        }

        // identify the requested path
        String requestedPath = getPath(repository, branch, request);

        // identify the commit
        RevCommit commit = JGitUtils.getCommit(r, branch);
        if (commit == null) {
            // branch not found!
            String mkd = MessageFormat.format("# Error\nSorry, the repository {0} does not have a **{1}** branch!", repository, branch);
            error(response, mkd);
            return;
        }

        Map<String, String> quickContentTypes = new HashMap<>();
        quickContentTypes.put("html", "text/html");
        quickContentTypes.put("htm", "text/html");
        quickContentTypes.put("xml", "application/xml");
        quickContentTypes.put("json", "application/json");

        List<PathModel> pathEntries = JGitUtils.getFilesInPath(r, requestedPath, commit);
        if (pathEntries.isEmpty()) {
            // requested a specific resource
            String file = StringUtils.getLastPathElement(requestedPath);
            try {
                String ext = StringUtils.getFileExtension(file).toLowerCase();
                // We can't parse out an extension for classic "dotfiles", so make a general assumption that
                // they're text files to allow presenting them in browser instead of only for download.
                //
                // However, that only holds for files with no other extension included, for files that happen
                // to start with a dot but also include an extension, process the extension normally.
                // This logic covers .gitattributes, .gitignore, .zshrc, etc., but does not cover .mongorc.js, .zshrc.bak
                //
                // startsWith() instead of charAt(0) so an empty file name does not throw.
                boolean isExtensionlessDotfile = file.startsWith(".") && (file.length() == 1 || file.indexOf('.', 1) < 0);
                String contentType = isExtensionlessDotfile ? "text/plain" : quickContentTypes.get(ext);
                if (contentType == null) {
                    List<String> exts = runtimeManager.getSettings().getStrings(Keys.web.prettyPrintExtensions);
                    if (exts.contains(ext)) {
                        // extension is a registered text type for pretty printing
                        contentType = "text/plain";
                    } else {
                        // query Tika for the content type
                        Tika tika = new Tika();
                        contentType = tika.detect(file);
                    }
                }
                if (contentType == null) {
                    // ask the container for the content type
                    contentType = context.getMimeType(requestedPath);
                    if (contentType == null) {
                        // still unknown content type, assume binary
                        contentType = "application/octet-stream";
                    }
                }

                if (isTextType(contentType) || isTextDataType(contentType)) {
                    // load, interpret, and serve text content as UTF-8
                    String[] encodings = runtimeManager.getSettings().getStrings(Keys.web.blobEncodings).toArray(new String[0]);
                    String content = JGitUtils.getStringContent(r, commit.getTree(), requestedPath, encodings);
                    if (content == null) {
                        logger.error("RawServlet Failed to load {} {} {}", repository, commit.getName(), path);
                        notFound(response, requestedPath, branch);
                        return;
                    }
                    byte[] bytes = content.getBytes(Constants.ENCODING);
                    setContentType(response, contentType);
                    response.setContentLength(bytes.length);
                    ByteArrayInputStream is = new ByteArrayInputStream(bytes);
                    sendContent(response, JGitUtils.getCommitDate(commit), is);
                } else {
                    // stream binary content directly from the repository
                    if (!streamFromRepo(request, response, r, commit, requestedPath)) {
                        logger.error("RawServlet Failed to load {} {} {}", repository, commit.getName(), path);
                        notFound(response, requestedPath, branch);
                    }
                }
                return;
            } catch (Exception e) {
                // fall through to the default 404 page below
                logger.error("RawServlet failed to serve " + requestedPath, e);
            }
        } else {
            // path request
            if (!pathInfo.endsWith("/")) {
                // redirect to trailing '/' url
                response.sendRedirect(request.getServletPath() + pathInfo + "/");
                return;
            }
            if (renderIndex()) {
                // locate and render an index file
                Map<String, String> names = new TreeMap<String, String>();
                for (PathModel entry : pathEntries) {
                    names.put(entry.name.toLowerCase(), entry.name);
                }
                List<String> extensions = new ArrayList<String>();
                extensions.add("html");
                extensions.add("htm");
                String content = null;
                for (String ext : extensions) {
                    String key = "index." + ext;
                    if (names.containsKey(key)) {
                        String fileName = names.get(key);
                        String fullPath = fileName;
                        if (!requestedPath.isEmpty()) {
                            fullPath = requestedPath + "/" + fileName;
                        }
                        String[] encodings = runtimeManager.getSettings().getStrings(Keys.web.blobEncodings).toArray(new String[0]);
                        String stringContent = JGitUtils.getStringContent(r, commit.getTree(), fullPath, encodings);
                        if (stringContent == null) {
                            continue;
                        }
                        content = stringContent;
                        requestedPath = fullPath;
                        break;
                    }
                }
                // Only serve an index that was actually found; otherwise fall through
                // to the directory listing instead of failing on a null content.
                if (content != null) {
                    response.setContentType("text/html; charset=" + Constants.ENCODING);
                    byte[] bytes = content.getBytes(Constants.ENCODING);
                    response.setContentLength(bytes.length);
                    ByteArrayInputStream is = new ByteArrayInputStream(bytes);
                    sendContent(response, JGitUtils.getCommitDate(commit), is);
                    return;
                }
            }
        }

        // no content, document list or 404 page
        if (pathEntries.isEmpty()) {
            // default 404 page
            notFound(response, requestedPath, branch);
            return;
        } else {
            //
            // directory list
            //
            response.setContentType("text/html");
            response.getWriter().append("<style>table th, table td { min-width: 150px; text-align: left; }</style>");
            response.getWriter().append("<table>");
            response.getWriter().append("<thead><tr><th>path</th><th>mode</th><th>size</th></tr>");
            response.getWriter().append("</thead>");
            response.getWriter().append("<tbody>");
            String pattern = "<tr><td><a href=\"{0}/{1}\">{1}</a></td><td>{2}</td><td>{3}</td></tr>";
            final ByteFormat byteFormat = new ByteFormat();
            if (pathEntries.get(0).path.indexOf('/') > -1) {
                // we are in a subdirectory, add parent directory link
                String pp = URLEncoder.encode(requestedPath, Constants.ENCODING);
                pathEntries.add(0, new PathModel("..", pp + "/..", null, 0, FileMode.TREE.getBits(), null, null));
            }
            // NOTE(review): basePath comes from the request and is written into the
            // page without HTML escaping -- confirm this cannot carry markup.
            String basePath = request.getServletPath() + pathInfo;
            if (basePath.charAt(basePath.length() - 1) == '/') {
                // strip trailing slash
                basePath = basePath.substring(0, basePath.length() - 1);
            }
            for (PathModel entry : pathEntries) {
                String pp = URLEncoder.encode(entry.name, Constants.ENCODING);
                response.getWriter().append(MessageFormat.format(pattern, basePath, pp, JGitUtils.getPermissionsFromMode(entry.mode), entry.isFile() ? byteFormat.format(entry.size) : ""));
            }
            response.getWriter().append("</tbody>");
            response.getWriter().append("</table>");
        }
    } catch (Throwable t) {
        logger.error("Failed to write page to client", t);
    } finally {
        // r is null when no repository matched the url
        if (r != null) {
            r.close();
        }
    }
}
use of org.apache.tika.Tika in project lucene-solr by apache.
the class MailEntityProcessor method addPartToDocument.
/**
 * Adds the content of a mail part to the given row.
 * <p>
 * Multipart parts are walked recursively (only the first alternative of a
 * <code>multipart/alternative</code>), embedded <code>message/rfc822</code>
 * parts are unwrapped, and leaf parts are converted to text with Tika. Leaf
 * text is appended to the CONTENT list of the row, or -- for parts with an
 * attachment disposition -- to the ATTACHMENT and ATTACHMENT_NAMES lists,
 * subject to the includeContent / processAttachment settings.
 *
 * @param part
 *          the message or body part to add
 * @param row
 *          the row that collects envelope fields, content and attachments
 * @param outerMost
 *          not used by this method
 * @throws Exception
 *           if the part cannot be read or parsed
 */
public void addPartToDocument(Part part, Map<String, Object> row, boolean outerMost) throws Exception {
    if (part instanceof Message) {
        addEnvelopeToDocument(part, row);
    }
    String ct = part.getContentType().toLowerCase(Locale.ROOT);
    ContentType ctype = new ContentType(ct);

    if (part.isMimeType("multipart/*")) {
        Object content = part.getContent();
        if (content instanceof Multipart) {
            Multipart mp = (Multipart) content;
            int count = mp.getCount();
            if (part.isMimeType("multipart/alternative")) {
                // only the first alternative is indexed; min() keeps an empty multipart from
                // requesting a body part that does not exist
                count = Math.min(count, 1);
            }
            for (int i = 0; i < count; i++) {
                addPartToDocument(mp.getBodyPart(i), row, false);
            }
        } else {
            LOG.warn("Multipart content is a not an instance of Multipart! Content is: " + (content != null ? content.getClass().getName() : "null") + ". Typically, this is due to the Java Activation JAR being loaded by the wrong classloader.");
        }
        return;
    }

    if (part.isMimeType("message/rfc822")) {
        addPartToDocument((Part) part.getContent(), row, false);
        return;
    }

    String disp = part.getDisposition();
    boolean isAttachment = disp != null && disp.equalsIgnoreCase(Part.ATTACHMENT);

    if (!isAttachment) {
        if (includeContent) {
            appendRowValue(row, CONTENT, parsePartToString(part, ctype).trim());
        }
        return;
    }

    if (!processAttachment) {
        return;
    }
    String fileName = part.getFileName();
    String content = parsePartToString(part, ctype);
    if (content == null || content.trim().length() == 0) {
        // empty attachments are neither indexed nor named
        return;
    }
    appendRowValue(row, ATTACHMENT, content.trim());
    appendRowValue(row, ATTACHMENT_NAMES, fileName);
}

/** Converts the body of a part to text with Tika, passing the part's base type as content-type hint. */
private String parsePartToString(Part part, ContentType ctype) throws Exception {
    InputStream is = part.getInputStream();
    Metadata contentTypeHint = new Metadata();
    contentTypeHint.set(Metadata.CONTENT_TYPE, ctype.getBaseType().toLowerCase(Locale.ENGLISH));
    return (new Tika()).parseToString(is, contentTypeHint);
}

/** Appends a value to the string list stored under key in the row, creating the list on first use. */
@SuppressWarnings("unchecked")
private static void appendRowValue(Map<String, Object> row, String key, String value) {
    List<String> values = (List<String>) row.get(key);
    if (values == null) {
        values = new ArrayList<String>();
        row.put(key, values);
    }
    values.add(value);
}
use of org.apache.tika.Tika in project ddf by codice.
the class URLResourceReader method getMimeType.
/**
 * Resolves the mime type of a resource.
 * <p>
 * Resolution order:
 * 1. the mimeTypeMapper, looked up by the file extension of productName;
 * 2. Apache Tika on the resource URL, but only for URIs with the file scheme
 *    and only when step 1 produced nothing, an empty string, or the default
 *    mime type.
 * A result of null or "content/unknown" is reported as "application/unknown".
 *
 * @param resourceURI the URI of the resource
 * @param productName the file name whose extension is used for the mapper lookup
 * @return the resolved mime type, never null
 */
private String getMimeType(URI resourceURI, String productName) throws MimeTypeResolutionException, IOException {
    String resolved = null;

    // Step 1: extension-based lookup through the mapper, when one is available.
    if (mimeTypeMapper != null) {
        String extension = FilenameUtils.getExtension(productName);
        resolved = mimeTypeMapper.getMimeTypeForFileExtension(extension);
    } else {
        LOGGER.debug("mimeTypeMapper is NULL");
    }

    // Step 2: for file-scheme URIs without a usable mapping, let Tika inspect the URL.
    boolean unresolved = resolved == null || resolved.isEmpty() || resolved.equals(DEFAULT_MIME_TYPE);
    if (unresolved && URL_FILE_SCHEME.equalsIgnoreCase(resourceURI.getScheme())) {
        resolved = new Tika().detect(resourceURI.toURL());
        LOGGER.debug("Tika determined mimeType for url = {}", resolved);
    } else {
        LOGGER.debug("mimeType = {} set by MimeTypeMapper", resolved);
    }

    // Normalize "nothing found" and "content/unknown" to a single value.
    if (resolved == null || "content/unknown".equals(resolved)) {
        resolved = "application/unknown";
    }

    LOGGER.debug("mimeType set to: {}", resolved);
    return resolved;
}
Aggregations