Use of org.apache.commons.compress.archivers.tar.TarArchiveEntry in project ignite by apache.
The class IgniteHadoopTestSuite, method download:
/**
 * Downloads and extracts an Apache product.
 *
 * @param appName Name of application for log messages.
 * @param homeVariable Pointer to home directory of the component.
 * @param downloadPath Relative download path of tar package.
 * @param destName Local directory name to install component.
 * @throws Exception If failed.
 */
private static void download(String appName, String homeVariable, String downloadPath, String destName) throws Exception {
    String homeVal = IgniteSystemProperties.getString(homeVariable);
    if (!F.isEmpty(homeVal) && new File(homeVal).isDirectory()) {
        X.println(homeVariable + " is set to: " + homeVal);
        return;
    }
    List<String> urls = F.asList("http://archive.apache.org/dist/", "http://apache-mirror.rbc.ru/pub/apache/", "http://www.eu.apache.org/dist/", "http://www.us.apache.org/dist/");
    String tmpPath = System.getProperty("java.io.tmpdir");
    X.println("tmp: " + tmpPath);
    final File install = new File(tmpPath + File.separatorChar + "__hadoop");
    final File home = new File(install, destName);
    X.println("Setting " + homeVariable + " to " + home.getAbsolutePath());
    System.setProperty(homeVariable, home.getAbsolutePath());
    final File successFile = new File(home, "__success");
    if (home.exists()) {
        if (successFile.exists()) {
            X.println(appName + " distribution already exists.");
            return;
        }
        X.println(appName + " distribution is invalid and it will be deleted.");
        if (!U.delete(home))
            throw new IOException("Failed to delete directory: " + home.getAbsolutePath());
    }
    for (String url : urls) {
        if (!(install.exists() || install.mkdirs()))
            throw new IOException("Failed to create directory: " + install.getAbsolutePath());
        URL u = new URL(url + downloadPath);
        X.println("Attempting to download from: " + u);
        try {
            URLConnection c = u.openConnection();
            c.connect();
            try (TarArchiveInputStream in = new TarArchiveInputStream(new GzipCompressorInputStream(new BufferedInputStream(c.getInputStream(), 32 * 1024)))) {
                TarArchiveEntry entry;
                while ((entry = in.getNextTarEntry()) != null) {
                    File dest = new File(install, entry.getName());
                    if (entry.isDirectory()) {
                        if (!dest.mkdirs())
                            throw new IllegalStateException();
                    } else if (entry.isSymbolicLink()) {
                        // Important: in Hadoop installation there are symlinks, we need to create them:
                        Path theLinkItself = Paths.get(install.getAbsolutePath(), entry.getName());
                        Path linkTarget = Paths.get(entry.getLinkName());
                        Files.createSymbolicLink(theLinkItself, linkTarget);
                    } else {
                        File parent = dest.getParentFile();
                        if (!(parent.exists() || parent.mkdirs()))
                            throw new IllegalStateException();
                        X.print(" [" + dest);
                        try (BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(dest, false), 128 * 1024)) {
                            U.copy(in, out);
                            out.flush();
                        }
                        Files.setPosixFilePermissions(dest.toPath(), modeToPermissionSet(entry.getMode()));
                        X.println("]");
                    }
                }
            }
            if (successFile.createNewFile())
                return;
        } catch (Exception e) {
            e.printStackTrace();
            U.delete(home);
        }
    }
    throw new IllegalStateException("Failed to install " + appName + ".");
}
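The helper modeToPermissionSet referenced above is defined elsewhere in IgniteHadoopTestSuite and is not part of this excerpt. A minimal sketch of what such a conversion from the numeric tar entry mode to java.nio POSIX permissions could look like, assuming the usual octal permission bits (an illustration, not the Ignite source; it relies on java.nio.file.attribute.PosixFilePermission, java.util.EnumSet and java.util.Set):

// Hypothetical sketch: map a tar mode such as 0755 onto PosixFilePermission values.
private static Set<PosixFilePermission> modeToPermissionSet(int mode) {
    Set<PosixFilePermission> perms = EnumSet.noneOf(PosixFilePermission.class);
    // Owner bits.
    if ((mode & 0400) != 0) perms.add(PosixFilePermission.OWNER_READ);
    if ((mode & 0200) != 0) perms.add(PosixFilePermission.OWNER_WRITE);
    if ((mode & 0100) != 0) perms.add(PosixFilePermission.OWNER_EXECUTE);
    // Group bits.
    if ((mode & 0040) != 0) perms.add(PosixFilePermission.GROUP_READ);
    if ((mode & 0020) != 0) perms.add(PosixFilePermission.GROUP_WRITE);
    if ((mode & 0010) != 0) perms.add(PosixFilePermission.GROUP_EXECUTE);
    // Others bits.
    if ((mode & 0004) != 0) perms.add(PosixFilePermission.OTHERS_READ);
    if ((mode & 0002) != 0) perms.add(PosixFilePermission.OTHERS_WRITE);
    if ((mode & 0001) != 0) perms.add(PosixFilePermission.OTHERS_EXECUTE);
    return perms;
}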
Use of org.apache.commons.compress.archivers.tar.TarArchiveEntry in project nutch by apache.
The class CommonCrawlDataDumper, method dump:
/**
 * Dumps the reverse engineered CBOR content from the provided segment
 * directories. If a parent directory contains more than one segment, all of
 * them are processed; otherwise a single segment can be passed as an
 * argument. If the boolean gzip argument is set, the CBOR is also zipped.
 *
 * @param outputDir the directory you wish to dump the raw content to. This
 *          directory will be created.
 * @param segmentRootDir a directory containing one or more segments.
 * @param linkdb Path to linkdb.
 * @param gzip a boolean flag indicating whether the CBOR content should also
 *          be gzipped.
 * @param mimeTypes an array of MIME types to restrict the dump to; if {@code null}, all types are dumped.
 * @param epochFilename if {@code true}, output files will be named using the epoch time (in milliseconds).
 * @param extension a file extension to use with output documents.
 * @param warc if {@code true}, the output is written in WARC format.
 * @throws Exception if any exception occurs.
 */
public void dump(File outputDir, File segmentRootDir, File linkdb, boolean gzip, String[] mimeTypes, boolean epochFilename, String extension, boolean warc) throws Exception {
    if (gzip) {
        LOG.info("Gzipping CBOR data has been skipped");
    }
    // total file counts
    Map<String, Integer> typeCounts = new HashMap<>();
    // filtered file counters
    Map<String, Integer> filteredCounts = new HashMap<>();
    Configuration nutchConfig = NutchConfiguration.create();
    Path segmentRootPath = new Path(segmentRootDir.toString());
    FileSystem fs = segmentRootPath.getFileSystem(nutchConfig);
    // get all paths
    List<Path> parts = new ArrayList<>();
    RemoteIterator<LocatedFileStatus> files = fs.listFiles(segmentRootPath, true);
    String partPattern = ".*" + File.separator + Content.DIR_NAME + File.separator + "part-[0-9]{5}" + File.separator + "data";
    while (files.hasNext()) {
        LocatedFileStatus next = files.next();
        if (next.isFile()) {
            Path path = next.getPath();
            if (path.toString().matches(partPattern)) {
                parts.add(path);
            }
        }
    }
    LinkDbReader linkDbReader = null;
    if (linkdb != null) {
        linkDbReader = new LinkDbReader(nutchConfig, new Path(linkdb.toString()));
    }
    if (parts == null || parts.size() == 0) {
        LOG.error("No segment directories found in {} ", segmentRootDir.getAbsolutePath());
        System.exit(1);
    }
    LOG.info("Found {} segment parts", parts.size());
    if (gzip && !warc) {
        fileList = new ArrayList<>();
        constructNewStream(outputDir);
    }
    for (Path segmentPart : parts) {
        LOG.info("Processing segment Part : [ {} ]", segmentPart);
        try {
            SequenceFile.Reader reader = new SequenceFile.Reader(nutchConfig, SequenceFile.Reader.file(segmentPart));
            Writable key = (Writable) reader.getKeyClass().newInstance();
            Content content = null;
            while (reader.next(key)) {
                content = new Content();
                reader.getCurrentValue(content);
                Metadata metadata = content.getMetadata();
                String url = key.toString();
                String baseName = FilenameUtils.getBaseName(url);
                String extensionName = FilenameUtils.getExtension(url);
                if (!extension.isEmpty()) {
                    extensionName = extension;
                } else if ((extensionName == null) || extensionName.isEmpty()) {
                    extensionName = "html";
                }
                String outputFullPath = null;
                String outputRelativePath = null;
                String filename = null;
                String timestamp = null;
                String reverseKey = null;
                if (epochFilename || config.getReverseKey()) {
                    try {
                        long epoch = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss z").parse(getDate(metadata.get("Date"))).getTime();
                        timestamp = String.valueOf(epoch);
                    } catch (ParseException pe) {
                        LOG.warn(pe.getMessage());
                    }
                    reverseKey = reverseUrl(url);
                    config.setReverseKeyValue(reverseKey.replace("/", "_") + "_" + DigestUtils.sha1Hex(url) + "_" + timestamp);
                }
                if (!warc) {
                    if (epochFilename) {
                        outputFullPath = DumpFileUtil.createFileNameFromUrl(outputDir.getAbsolutePath(), reverseKey, url, timestamp, extensionName, !gzip);
                        outputRelativePath = outputFullPath.substring(0, outputFullPath.lastIndexOf(File.separator) - 1);
                        filename = content.getMetadata().get(Metadata.DATE) + "." + extensionName;
                    } else {
                        String md5Ofurl = DumpFileUtil.getUrlMD5(url);
                        String fullDir = DumpFileUtil.createTwoLevelsDirectory(outputDir.getAbsolutePath(), md5Ofurl, !gzip);
                        filename = DumpFileUtil.createFileName(md5Ofurl, baseName, extensionName);
                        outputFullPath = String.format("%s/%s", fullDir, filename);
                        String[] fullPathLevels = fullDir.split(Pattern.quote(File.separator));
                        String firstLevelDirName = fullPathLevels[fullPathLevels.length - 2];
                        String secondLevelDirName = fullPathLevels[fullPathLevels.length - 1];
                        outputRelativePath = firstLevelDirName + secondLevelDirName;
                    }
                }
                // Encode all filetypes if no mimetypes have been given
                Boolean filter = (mimeTypes == null);
                String jsonData = "";
                try {
                    String mimeType = new Tika().detect(content.getContent());
                    // Maps file to JSON-based structure
                    // there may be duplicates, so using set
                    Set<String> inUrls = null;
                    if (linkDbReader != null) {
                        Inlinks inlinks = linkDbReader.getInlinks((Text) key);
                        if (inlinks != null) {
                            Iterator<Inlink> iterator = inlinks.iterator();
                            inUrls = new LinkedHashSet<>();
                            while (inUrls.size() <= MAX_INLINKS && iterator.hasNext()) {
                                inUrls.add(iterator.next().getFromUrl());
                            }
                        }
                    }
                    // TODO: Make this Jackson Format implementation reusable
                    try (CommonCrawlFormat format = CommonCrawlFormatFactory.getCommonCrawlFormat(warc ? "WARC" : "JACKSON", nutchConfig, config)) {
                        if (inUrls != null) {
                            format.setInLinks(new ArrayList<>(inUrls));
                        }
                        jsonData = format.getJsonData(url, content, metadata);
                    }
                    collectStats(typeCounts, mimeType);
                    // collects statistics for the given mimetypes
                    if ((mimeType != null) && (mimeTypes != null) && Arrays.asList(mimeTypes).contains(mimeType)) {
                        collectStats(filteredCounts, mimeType);
                        filter = true;
                    }
                } catch (IOException ioe) {
                    LOG.error("Fatal error in creating JSON data: " + ioe.getMessage());
                    return;
                }
                if (!warc) {
                    if (filter) {
                        byte[] byteData = serializeCBORData(jsonData);
                        if (!gzip) {
                            File outputFile = new File(outputFullPath);
                            if (outputFile.exists()) {
                                LOG.info("Skipping writing: [" + outputFullPath + "]: file already exists");
                            } else {
                                LOG.info("Writing: [" + outputFullPath + "]");
                                IOUtils.copy(new ByteArrayInputStream(byteData), new FileOutputStream(outputFile));
                            }
                        } else {
                            if (fileList.contains(outputFullPath)) {
                                LOG.info("Skipping compressing: [" + outputFullPath + "]: file already exists");
                            } else {
                                fileList.add(outputFullPath);
                                LOG.info("Compressing: [" + outputFullPath + "]");
                                // TarArchiveEntry tarEntry = new TarArchiveEntry(firstLevelDirName + File.separator + secondLevelDirName + File.separator + filename);
                                TarArchiveEntry tarEntry = new TarArchiveEntry(outputRelativePath + File.separator + filename);
                                tarEntry.setSize(byteData.length);
                                tarOutput.putArchiveEntry(tarEntry);
                                tarOutput.write(byteData);
                                tarOutput.closeArchiveEntry();
                            }
                        }
                    }
                }
            }
            reader.close();
        } catch (Exception e) {
            LOG.warn("SKIPPED: {} Because : {}", segmentPart, e.getMessage());
        } finally {
            fs.close();
        }
    }
    if (gzip && !warc) {
        closeStream();
    }
    if (!typeCounts.isEmpty()) {
        LOG.info("CommonsCrawlDataDumper File Stats: " + DumpFileUtil.displayFileTypes(typeCounts, filteredCounts));
    }
}
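The dump method relies on the fields fileList and tarOutput and on the helpers constructNewStream and closeStream, none of which appear in this excerpt. A hedged sketch of how such a gzipped tar output could be opened and closed (the archive name pattern and construction details here are assumptions, not the Nutch source):

// Hypothetical sketch of the stream helpers used by dump() above (not the Nutch source).
// tarOutput wraps a gzip stream so every entry written in dump() lands in one .tar.gz file.
private TarArchiveOutputStream tarOutput;
private List<String> fileList;

private void constructNewStream(File outputDir) throws IOException {
    // Assumed naming scheme: timestamped archive inside the output directory.
    String archiveName = new SimpleDateFormat("yyyyMMddHHmm'.tar.gz'").format(new Date());
    tarOutput = new TarArchiveOutputStream(
        new GzipCompressorOutputStream(
            new BufferedOutputStream(new FileOutputStream(new File(outputDir, archiveName)))));
    // Allow entry names longer than the 100-character tar header limit.
    tarOutput.setLongFileMode(TarArchiveOutputStream.LONGFILE_GNU);
}

private void closeStream() {
    try {
        tarOutput.finish();
        tarOutput.close();
    } catch (IOException ioe) {
        LOG.warn("Error closing tar archive: " + ioe.getMessage());
    }
}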
Use of org.apache.commons.compress.archivers.tar.TarArchiveEntry in project Lucee by lucee.
The class CompressUtil, method compressTar:
private static void compressTar(String parent, Resource source, TarArchiveOutputStream tos, int mode) throws IOException {
    if (source.isFile()) {
        // TarEntry entry = (source instanceof FileResource)?new TarEntry((FileResource)source):new TarEntry(parent);
        TarArchiveEntry entry = new TarArchiveEntry(parent);
        entry.setName(parent);
        // 100777 TODO: is that OK like this?
        if (mode > 0)
            entry.setMode(mode);
        else if ((mode = source.getMode()) > 0)
            entry.setMode(mode);
        entry.setSize(source.length());
        entry.setModTime(source.lastModified());
        tos.putArchiveEntry(entry);
        try {
            IOUtil.copy(source, tos, false);
        } finally {
            tos.closeArchiveEntry();
        }
    } else if (source.isDirectory()) {
        compressTar(parent, source.listResources(), tos, mode);
    }
}
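The directory branch above delegates to an overload of compressTar that takes a Resource array; that overload is not included in this excerpt. A plausible sketch of what it does, recursing with each child name appended to the parent path so entries keep their relative location inside the archive (an assumption for illustration, not the Lucee source):

// Hypothetical sketch of the array overload called above (not the Lucee source).
private static void compressTar(String parent, Resource[] sources, TarArchiveOutputStream tos, int mode) throws IOException {
    if (!parent.isEmpty())
        parent += "/";
    for (int i = 0; i < sources.length; i++) {
        // Each child entry is named relative to its parent directory inside the archive.
        compressTar(parent + sources[i].getName(), sources[i], tos, mode);
    }
}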
Use of org.apache.commons.compress.archivers.tar.TarArchiveEntry in project knime-core by knime.
The class ExtractTarGz, method untar:
private static void untar(final InputStream in, final File destDir) throws IOException {
    try (TarArchiveInputStream tarInS = new TarArchiveInputStream(in)) {
        TarArchiveEntry entry;
        while ((entry = tarInS.getNextTarEntry()) != null) {
            String name = entry.getName();
            File destFile = new File(destDir, name);
            if (entry.isSymbolicLink()) {
                Files.createSymbolicLink(destFile.toPath(), Paths.get(entry.getLinkName()));
            } else if (entry.isDirectory()) {
                destFile.mkdirs();
                chmod(destFile, entry.getMode());
            } else {
                try (FileOutputStream out = new FileOutputStream(destFile)) {
                    long size = entry.getSize();
                    IOUtils.copyLarge(tarInS, out, 0, size);
                }
                chmod(destFile, entry.getMode());
            }
        }
    }
}
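untar expects an already-decompressed tar stream, so for a .tar.gz archive a caller would first route the file through a gzip decompressor. A hedged usage sketch (the file paths are hypothetical):

// Hypothetical usage sketch (not the KNIME source): decompress the gzip layer,
// then hand the raw tar stream to untar for extraction.
try (InputStream in = new GzipCompressorInputStream(
        new BufferedInputStream(new FileInputStream(new File("/tmp/archive.tar.gz"))))) {
    untar(in, new File("/tmp/extracted"));
}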
Use of org.apache.commons.compress.archivers.tar.TarArchiveEntry in project AmazeFileManager by TeamAmaze.
The class GzipHelperTask, method addElements:
@Override
void addElements(ArrayList<CompressedObjectParcelable> elements) {
    TarArchiveInputStream tarInputStream = null;
    try {
        tarInputStream = new TarArchiveInputStream(new GzipCompressorInputStream(new FileInputStream(filePath)));
        TarArchiveEntry entry;
        while ((entry = tarInputStream.getNextTarEntry()) != null) {
            String name = entry.getName();
            if (name.endsWith(SEPARATOR))
                name = name.substring(0, name.length() - 1);
            boolean isInBaseDir = relativePath.equals("") && !name.contains(SEPARATOR);
            boolean isInRelativeDir = name.contains(SEPARATOR) && name.substring(0, name.lastIndexOf(SEPARATOR)).equals(relativePath);
            if (isInBaseDir || isInRelativeDir) {
                elements.add(new CompressedObjectParcelable(entry.getName(), entry.getLastModifiedDate().getTime(), entry.getSize(), entry.isDirectory()));
            }
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
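The two boolean checks above list only entries that sit directly inside relativePath, or directly in the archive root when relativePath is empty. A small self-contained illustration of the matching, assuming SEPARATOR is "/":

// Hypothetical illustration of the filter above, assuming SEPARATOR = "/".
// For relativePath = "docs":
//   "docs/readme.txt"   -> listed  (its parent is exactly "docs")
//   "docs/img/logo.png" -> skipped (its parent is "docs/img")
//   "top.txt"           -> skipped (only listed when relativePath is "")
String relativePath = "docs";
for (String name : new String[] {"docs/readme.txt", "docs/img/logo.png", "top.txt", "docs/sub/"}) {
    if (name.endsWith("/"))
        name = name.substring(0, name.length() - 1);
    boolean isInBaseDir = relativePath.equals("") && !name.contains("/");
    boolean isInRelativeDir = name.contains("/") && name.substring(0, name.lastIndexOf("/")).equals(relativePath);
    System.out.println(name + " -> " + (isInBaseDir || isInRelativeDir));
}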