Example 21 with TarArchiveEntry

use of org.apache.commons.compress.archivers.tar.TarArchiveEntry in project ignite by apache.

the class IgniteHadoopTestSuite method download.

/**
 * Downloads and extracts an Apache product.
 *
 * @param appName Name of application for log messages.
 * @param homeVariable Pointer to home directory of the component.
 * @param downloadPath Relative download path of tar package.
 * @param destName Local directory name to install component.
 * @throws Exception If failed.
 */
private static void download(String appName, String homeVariable, String downloadPath, String destName) throws Exception {
    String homeVal = IgniteSystemProperties.getString(homeVariable);
    if (!F.isEmpty(homeVal) && new File(homeVal).isDirectory()) {
        X.println(homeVariable + " is set to: " + homeVal);
        return;
    }
    List<String> urls = F.asList("http://archive.apache.org/dist/", "http://apache-mirror.rbc.ru/pub/apache/", "http://www.eu.apache.org/dist/", "http://www.us.apache.org/dist/");
    String tmpPath = System.getProperty("java.io.tmpdir");
    X.println("tmp: " + tmpPath);
    final File install = new File(tmpPath + File.separatorChar + "__hadoop");
    final File home = new File(install, destName);
    X.println("Setting " + homeVariable + " to " + home.getAbsolutePath());
    System.setProperty(homeVariable, home.getAbsolutePath());
    final File successFile = new File(home, "__success");
    if (home.exists()) {
        if (successFile.exists()) {
            X.println(appName + " distribution already exists.");
            return;
        }
        X.println(appName + " distribution is invalid and it will be deleted.");
        if (!U.delete(home))
            throw new IOException("Failed to delete directory: " + home.getAbsolutePath());
    }
    for (String url : urls) {
        if (!(install.exists() || install.mkdirs()))
            throw new IOException("Failed to create directory: " + install.getAbsolutePath());
        URL u = new URL(url + downloadPath);
        X.println("Attempting to download from: " + u);
        try {
            URLConnection c = u.openConnection();
            c.connect();
            try (TarArchiveInputStream in = new TarArchiveInputStream(new GzipCompressorInputStream(new BufferedInputStream(c.getInputStream(), 32 * 1024)))) {
                TarArchiveEntry entry;
                while ((entry = in.getNextTarEntry()) != null) {
                    File dest = new File(install, entry.getName());
                    if (entry.isDirectory()) {
                        if (!dest.mkdirs())
                            throw new IllegalStateException();
                    } else if (entry.isSymbolicLink()) {
                        // Important: the Hadoop distribution contains symlinks, so we need to create them:
                        Path theLinkItself = Paths.get(install.getAbsolutePath(), entry.getName());
                        Path linkTarget = Paths.get(entry.getLinkName());
                        Files.createSymbolicLink(theLinkItself, linkTarget);
                    } else {
                        File parent = dest.getParentFile();
                        if (!(parent.exists() || parent.mkdirs()))
                            throw new IllegalStateException();
                        X.print(" [" + dest);
                        try (BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(dest, false), 128 * 1024)) {
                            U.copy(in, out);
                            out.flush();
                        }
                        Files.setPosixFilePermissions(dest.toPath(), modeToPermissionSet(entry.getMode()));
                        X.println("]");
                    }
                }
            }
            if (successFile.createNewFile())
                return;
        } catch (Exception e) {
            e.printStackTrace();
            U.delete(home);
        }
    }
    throw new IllegalStateException("Failed to install " + appName + ".");
}
Also used : GzipCompressorInputStream(org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream) Path(java.nio.file.Path) IOException(java.io.IOException) URL(java.net.URL) URLConnection(java.net.URLConnection) TarArchiveEntry(org.apache.commons.compress.archivers.tar.TarArchiveEntry) TarArchiveInputStream(org.apache.commons.compress.archivers.tar.TarArchiveInputStream) BufferedInputStream(java.io.BufferedInputStream) FileOutputStream(java.io.FileOutputStream) File(java.io.File) BufferedOutputStream(java.io.BufferedOutputStream)
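
A note on this example: the modeToPermissionSet helper it calls is not shown in the excerpt. A minimal sketch of what such a conversion could look like, assuming the standard nine-bit POSIX mode layout (an illustration, not Ignite's actual implementation):

import java.nio.file.attribute.PosixFilePermission;
import java.util.EnumSet;
import java.util.Set;

// Hypothetical stand-in for the helper used above: maps the low nine POSIX
// mode bits of a tar entry (e.g. 0755) to Java's PosixFilePermission values.
private static Set<PosixFilePermission> modeToPermissionSet(int mode) {
    Set<PosixFilePermission> perms = EnumSet.noneOf(PosixFilePermission.class);
    // values() is declared in order OWNER_READ .. OTHERS_EXECUTE,
    // matching the mode bits from 0400 down to 0001.
    PosixFilePermission[] ordered = PosixFilePermission.values();
    for (int i = 0; i < 9; i++) {
        if ((mode & (1 << (8 - i))) != 0)
            perms.add(ordered[i]);
    }
    return perms;
}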

Example 22 with TarArchiveEntry

use of org.apache.commons.compress.archivers.tar.TarArchiveEntry in project nutch by apache.

the class CommonCrawlDataDumper method dump.

/**
 * Dumps the reverse-engineered CBOR content from the provided segment
 * directories. A parent directory containing more than one segment can be
 * passed, or a single segment can be passed directly as the argument. If
 * the gzip flag is set, the CBOR content is also zipped.
 *
 * @param outputDir      the directory to dump the raw content to. This
 *                       directory will be created.
 * @param segmentRootDir a directory containing one or more segments.
 * @param linkdb         path to the linkdb.
 * @param gzip           a boolean flag indicating whether the CBOR content should also
 *                       be gzipped.
 * @param mimeTypes      MIME types to filter on; {@code null} encodes all file types.
 * @param epochFilename  if {@code true}, output files will be named using the epoch time (in milliseconds).
 * @param extension      a file extension to use with output documents.
 * @param warc           if {@code true}, output is written via the WARC format rather than as CBOR files.
 * @throws Exception if any exception occurs.
 */
public void dump(File outputDir, File segmentRootDir, File linkdb, boolean gzip, String[] mimeTypes, boolean epochFilename, String extension, boolean warc) throws Exception {
    if (gzip) {
        LOG.info("Gzipping CBOR data has been skipped");
    }
    // total file counts
    Map<String, Integer> typeCounts = new HashMap<>();
    // filtered file counters
    Map<String, Integer> filteredCounts = new HashMap<>();
    Configuration nutchConfig = NutchConfiguration.create();
    Path segmentRootPath = new Path(segmentRootDir.toString());
    FileSystem fs = segmentRootPath.getFileSystem(nutchConfig);
    // get all paths
    List<Path> parts = new ArrayList<>();
    RemoteIterator<LocatedFileStatus> files = fs.listFiles(segmentRootPath, true);
    String partPattern = ".*" + File.separator + Content.DIR_NAME + File.separator + "part-[0-9]{5}" + File.separator + "data";
    while (files.hasNext()) {
        LocatedFileStatus next = files.next();
        if (next.isFile()) {
            Path path = next.getPath();
            if (path.toString().matches(partPattern)) {
                parts.add(path);
            }
        }
    }
    LinkDbReader linkDbReader = null;
    if (linkdb != null) {
        linkDbReader = new LinkDbReader(nutchConfig, new Path(linkdb.toString()));
    }
    if (parts == null || parts.size() == 0) {
        LOG.error("No segment directories found in {} ", segmentRootDir.getAbsolutePath());
        System.exit(1);
    }
    LOG.info("Found {} segment parts", parts.size());
    if (gzip && !warc) {
        fileList = new ArrayList<>();
        constructNewStream(outputDir);
    }
    for (Path segmentPart : parts) {
        LOG.info("Processing segment Part : [ {} ]", segmentPart);
        try {
            SequenceFile.Reader reader = new SequenceFile.Reader(nutchConfig, SequenceFile.Reader.file(segmentPart));
            Writable key = (Writable) reader.getKeyClass().newInstance();
            Content content = null;
            while (reader.next(key)) {
                content = new Content();
                reader.getCurrentValue(content);
                Metadata metadata = content.getMetadata();
                String url = key.toString();
                String baseName = FilenameUtils.getBaseName(url);
                String extensionName = FilenameUtils.getExtension(url);
                if (!extension.isEmpty()) {
                    extensionName = extension;
                } else if ((extensionName == null) || extensionName.isEmpty()) {
                    extensionName = "html";
                }
                String outputFullPath = null;
                String outputRelativePath = null;
                String filename = null;
                String timestamp = null;
                String reverseKey = null;
                if (epochFilename || config.getReverseKey()) {
                    try {
                        long epoch = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss z").parse(getDate(metadata.get("Date"))).getTime();
                        timestamp = String.valueOf(epoch);
                    } catch (ParseException pe) {
                        LOG.warn(pe.getMessage());
                    }
                    reverseKey = reverseUrl(url);
                    config.setReverseKeyValue(reverseKey.replace("/", "_") + "_" + DigestUtils.sha1Hex(url) + "_" + timestamp);
                }
                if (!warc) {
                    if (epochFilename) {
                        outputFullPath = DumpFileUtil.createFileNameFromUrl(outputDir.getAbsolutePath(), reverseKey, url, timestamp, extensionName, !gzip);
                        outputRelativePath = outputFullPath.substring(0, outputFullPath.lastIndexOf(File.separator) - 1);
                        filename = content.getMetadata().get(Metadata.DATE) + "." + extensionName;
                    } else {
                        String md5Ofurl = DumpFileUtil.getUrlMD5(url);
                        String fullDir = DumpFileUtil.createTwoLevelsDirectory(outputDir.getAbsolutePath(), md5Ofurl, !gzip);
                        filename = DumpFileUtil.createFileName(md5Ofurl, baseName, extensionName);
                        outputFullPath = String.format("%s/%s", fullDir, filename);
                        String[] fullPathLevels = fullDir.split(Pattern.quote(File.separator));
                        String firstLevelDirName = fullPathLevels[fullPathLevels.length - 2];
                        String secondLevelDirName = fullPathLevels[fullPathLevels.length - 1];
                        outputRelativePath = firstLevelDirName + secondLevelDirName;
                    }
                }
                // Encode all filetypes if no mimetypes have been given
                Boolean filter = (mimeTypes == null);
                String jsonData = "";
                try {
                    String mimeType = new Tika().detect(content.getContent());
                    // Maps file to JSON-based structure
                    // there may be duplicates, so using set
                    Set<String> inUrls = null;
                    if (linkDbReader != null) {
                        Inlinks inlinks = linkDbReader.getInlinks((Text) key);
                        if (inlinks != null) {
                            Iterator<Inlink> iterator = inlinks.iterator();
                            inUrls = new LinkedHashSet<>();
                            while (inUrls.size() <= MAX_INLINKS && iterator.hasNext()) {
                                inUrls.add(iterator.next().getFromUrl());
                            }
                        }
                    }
                    // TODO: Make this Jackson Format implementation reusable
                    try (CommonCrawlFormat format = CommonCrawlFormatFactory.getCommonCrawlFormat(warc ? "WARC" : "JACKSON", nutchConfig, config)) {
                        if (inUrls != null) {
                            format.setInLinks(new ArrayList<>(inUrls));
                        }
                        jsonData = format.getJsonData(url, content, metadata);
                    }
                    collectStats(typeCounts, mimeType);
                    // collects statistics for the given mimetypes
                    if ((mimeType != null) && (mimeTypes != null) && Arrays.asList(mimeTypes).contains(mimeType)) {
                        collectStats(filteredCounts, mimeType);
                        filter = true;
                    }
                } catch (IOException ioe) {
                    LOG.error("Fatal error in creating JSON data: " + ioe.getMessage());
                    return;
                }
                if (!warc) {
                    if (filter) {
                        byte[] byteData = serializeCBORData(jsonData);
                        if (!gzip) {
                            File outputFile = new File(outputFullPath);
                            if (outputFile.exists()) {
                                LOG.info("Skipping writing: [" + outputFullPath + "]: file already exists");
                            } else {
                                LOG.info("Writing: [" + outputFullPath + "]");
                                // try-with-resources closes the output stream, avoiding a file-handle leak
                                try (FileOutputStream fos = new FileOutputStream(outputFile)) {
                                    IOUtils.copy(new ByteArrayInputStream(byteData), fos);
                                }
                            }
                        } else {
                            if (fileList.contains(outputFullPath)) {
                                LOG.info("Skipping compressing: [" + outputFullPath + "]: file already exists");
                            } else {
                                fileList.add(outputFullPath);
                                LOG.info("Compressing: [" + outputFullPath + "]");
                                // TarArchiveEntry tarEntry = new TarArchiveEntry(firstLevelDirName + File.separator + secondLevelDirName + File.separator + filename);
                                TarArchiveEntry tarEntry = new TarArchiveEntry(outputRelativePath + File.separator + filename);
                                tarEntry.setSize(byteData.length);
                                tarOutput.putArchiveEntry(tarEntry);
                                tarOutput.write(byteData);
                                tarOutput.closeArchiveEntry();
                            }
                        }
                    }
                }
            }
            reader.close();
        } catch (Exception e) {
            LOG.warn("SKIPPED: {} Because : {}", segmentPart, e.getMessage());
        }
    }
    // Close the FileSystem only after all segment parts are processed;
    // closing it inside a per-part finally block would break later iterations.
    fs.close();
    if (gzip && !warc) {
        closeStream();
    }
    if (!typeCounts.isEmpty()) {
        LOG.info("CommonsCrawlDataDumper File Stats: " + DumpFileUtil.displayFileTypes(typeCounts, filteredCounts));
    }
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) NutchConfiguration(org.apache.nutch.util.NutchConfiguration) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) Metadata(org.apache.nutch.metadata.Metadata) LinkDbReader(org.apache.nutch.crawl.LinkDbReader) Writable(org.apache.hadoop.io.Writable) Tika(org.apache.tika.Tika) SequenceFile(org.apache.hadoop.io.SequenceFile) FileSystem(org.apache.hadoop.fs.FileSystem) Path(org.apache.hadoop.fs.Path) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) Inlinks(org.apache.nutch.crawl.Inlinks) IOException(java.io.IOException) Inlink(org.apache.nutch.crawl.Inlink) TarArchiveEntry(org.apache.commons.compress.archivers.tar.TarArchiveEntry) ParseException(java.text.ParseException) MalformedURLException(java.net.MalformedURLException) ByteArrayInputStream(java.io.ByteArrayInputStream) Content(org.apache.nutch.protocol.Content) FileOutputStream(java.io.FileOutputStream) SimpleDateFormat(com.ibm.icu.text.SimpleDateFormat) File(java.io.File)
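
Similarly, constructNewStream and closeStream are not shown in this excerpt. A plausible sketch of opening the kind of gzipped tar output stream the code writes to (the method name and buffer size are illustrative assumptions, not Nutch's actual implementation):

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream;

// Illustrative only: opens a .tar.gz for writing, roughly what a helper
// like constructNewStream would need to do before tarOutput is usable.
static TarArchiveOutputStream openTarGz(File outputDir, String baseName) throws IOException {
    File archive = new File(outputDir, baseName + ".tar.gz");
    TarArchiveOutputStream tar = new TarArchiveOutputStream(
            new GzipCompressorOutputStream(
                    new BufferedOutputStream(new FileOutputStream(archive), 64 * 1024)));
    // GNU long-file mode permits entry names beyond the classic 100-character limit.
    tar.setLongFileMode(TarArchiveOutputStream.LONGFILE_GNU);
    return tar;
}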

Example 23 with TarArchiveEntry

use of org.apache.commons.compress.archivers.tar.TarArchiveEntry in project Lucee by lucee.

the class CompressUtil method compressTar.

private static void compressTar(String parent, Resource source, TarArchiveOutputStream tos, int mode) throws IOException {
    if (source.isFile()) {
        // TarEntry entry = (source instanceof FileResource)?new TarEntry((FileResource)source):new TarEntry(parent);
        TarArchiveEntry entry = new TarArchiveEntry(parent);
        entry.setName(parent);
        // 100777 TODO is this OK?
        if (mode > 0)
            entry.setMode(mode);
        else if ((mode = source.getMode()) > 0)
            entry.setMode(mode);
        entry.setSize(source.length());
        entry.setModTime(source.lastModified());
        tos.putArchiveEntry(entry);
        try {
            IOUtil.copy(source, tos, false);
        } finally {
            tos.closeArchiveEntry();
        }
    } else if (source.isDirectory()) {
        compressTar(parent, source.listResources(), tos, mode);
    }
}
Also used : TarArchiveEntry(org.apache.commons.compress.archivers.tar.TarArchiveEntry)
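
The same putArchiveEntry / write / closeArchiveEntry sequence works against a plain java.io.File instead of Lucee's Resource abstraction. A minimal self-contained sketch:

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
import org.apache.commons.compress.utils.IOUtils;

// Adds one file to an open tar stream. The TarArchiveEntry(File, String)
// constructor copies size and modification time from the file; both must be
// set before putArchiveEntry, because the tar header is written at that point.
static void addFile(TarArchiveOutputStream tos, File src, String entryName) throws IOException {
    TarArchiveEntry entry = new TarArchiveEntry(src, entryName);
    tos.putArchiveEntry(entry);
    try (FileInputStream in = new FileInputStream(src)) {
        IOUtils.copy(in, tos);
    } finally {
        tos.closeArchiveEntry();
    }
}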

Example 24 with TarArchiveEntry

use of org.apache.commons.compress.archivers.tar.TarArchiveEntry in project knime-core by knime.

the class ExtractTarGz method untar.

private static void untar(final InputStream in, final File destDir) throws IOException {
    try (TarArchiveInputStream tarInS = new TarArchiveInputStream(in)) {
        TarArchiveEntry entry;
        while ((entry = tarInS.getNextTarEntry()) != null) {
            String name = entry.getName();
            File destFile = new File(destDir, name);
            if (entry.isSymbolicLink()) {
                Files.createSymbolicLink(destFile.toPath(), Paths.get(entry.getLinkName()));
            } else if (entry.isDirectory()) {
                destFile.mkdirs();
                chmod(destFile, entry.getMode());
            } else {
                try (FileOutputStream out = new FileOutputStream(destFile)) {
                    long size = entry.getSize();
                    IOUtils.copyLarge(tarInS, out, 0, size);
                }
                chmod(destFile, entry.getMode());
            }
        }
    }
}
Also used : TarArchiveInputStream(org.apache.commons.compress.archivers.tar.TarArchiveInputStream) FileOutputStream(java.io.FileOutputStream) File(java.io.File) TarArchiveEntry(org.apache.commons.compress.archivers.tar.TarArchiveEntry)
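
Note that none of the extraction loops on this page validate entry names, so a crafted archive with names like "../../x" could write outside the destination directory. A hardened variant of this loop with a canonical-path check (a "zip slip" guard) might look like the following sketch; symlink and permission handling from the example above are omitted for brevity:

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.compress.utils.IOUtils;

static void untarSafely(final InputStream in, final File destDir) throws IOException {
    String destRoot = destDir.getCanonicalPath() + File.separator;
    try (TarArchiveInputStream tarInS = new TarArchiveInputStream(in)) {
        TarArchiveEntry entry;
        while ((entry = tarInS.getNextTarEntry()) != null) {
            File destFile = new File(destDir, entry.getName());
            // Reject entries that resolve outside the destination directory.
            if (!destFile.getCanonicalPath().startsWith(destRoot))
                throw new IOException("Entry escapes destination: " + entry.getName());
            if (entry.isDirectory()) {
                destFile.mkdirs();
            } else {
                destFile.getParentFile().mkdirs();
                try (FileOutputStream out = new FileOutputStream(destFile)) {
                    IOUtils.copy(tarInS, out);
                }
            }
        }
    }
}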

Example 25 with TarArchiveEntry

use of org.apache.commons.compress.archivers.tar.TarArchiveEntry in project AmazeFileManager by TeamAmaze.

the class GzipHelperTask method addElements.

@Override
void addElements(ArrayList<CompressedObjectParcelable> elements) {
    // try-with-resources ensures the stream is closed even if reading fails
    try (TarArchiveInputStream tarInputStream = new TarArchiveInputStream(
            new GzipCompressorInputStream(new FileInputStream(filePath)))) {
        TarArchiveEntry entry;
        while ((entry = tarInputStream.getNextTarEntry()) != null) {
            String name = entry.getName();
            if (name.endsWith(SEPARATOR))
                name = name.substring(0, name.length() - 1);
            boolean isInBaseDir = relativePath.equals("") && !name.contains(SEPARATOR);
            boolean isInRelativeDir = name.contains(SEPARATOR) && name.substring(0, name.lastIndexOf(SEPARATOR)).equals(relativePath);
            if (isInBaseDir || isInRelativeDir) {
                elements.add(new CompressedObjectParcelable(entry.getName(), entry.getLastModifiedDate().getTime(), entry.getSize(), entry.isDirectory()));
            }
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
Also used : TarArchiveInputStream(org.apache.commons.compress.archivers.tar.TarArchiveInputStream) GzipCompressorInputStream(org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream) CompressedObjectParcelable(com.amaze.filemanager.adapters.data.CompressedObjectParcelable) IOException(java.io.IOException) FileInputStream(java.io.FileInputStream) TarArchiveEntry(org.apache.commons.compress.archivers.tar.TarArchiveEntry)
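
The base-directory / relative-directory filter above generalizes to a standalone helper. A sketch that lists only the direct children of one directory inside a .tar.gz, assuming entry names use forward slashes as separators:

import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;

// Lists entries whose parent path equals dir; dir = "" selects the archive root.
static List<String> listDirectChildren(String archivePath, String dir) throws IOException {
    List<String> names = new ArrayList<>();
    try (TarArchiveInputStream in = new TarArchiveInputStream(
            new GzipCompressorInputStream(new FileInputStream(archivePath)))) {
        TarArchiveEntry entry;
        while ((entry = in.getNextTarEntry()) != null) {
            String name = entry.getName();
            if (name.endsWith("/"))
                name = name.substring(0, name.length() - 1);
            int slash = name.lastIndexOf('/');
            String parent = slash < 0 ? "" : name.substring(0, slash);
            if (parent.equals(dir))
                names.add(name);
        }
    }
    return names;
}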

Aggregations

TarArchiveEntry (org.apache.commons.compress.archivers.tar.TarArchiveEntry)200 TarArchiveInputStream (org.apache.commons.compress.archivers.tar.TarArchiveInputStream)99 File (java.io.File)86 FileInputStream (java.io.FileInputStream)55 IOException (java.io.IOException)55 FileOutputStream (java.io.FileOutputStream)45 GzipCompressorInputStream (org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream)38 InputStream (java.io.InputStream)31 BufferedInputStream (java.io.BufferedInputStream)29 TarArchiveOutputStream (org.apache.commons.compress.archivers.tar.TarArchiveOutputStream)25 ByteArrayInputStream (java.io.ByteArrayInputStream)22 Path (java.nio.file.Path)21 BufferedOutputStream (java.io.BufferedOutputStream)20 ByteArrayOutputStream (java.io.ByteArrayOutputStream)20 Test (org.junit.Test)20 ArrayList (java.util.ArrayList)18 OutputStream (java.io.OutputStream)17 ArchiveStreamFactory (org.apache.commons.compress.archivers.ArchiveStreamFactory)15 HashMap (java.util.HashMap)11 GZIPInputStream (java.util.zip.GZIPInputStream)11