Example 91 with TarArchiveEntry

Use of org.apache.commons.compress.archivers.tar.TarArchiveEntry in project gradle by gradle.

The class CommonsTarPacker, method pack.

@Override
public void pack(List<DataSource> inputs, DataTarget output) throws IOException {
    // try-with-resources guarantees the tar stream is closed even if a copy fails
    try (TarArchiveOutputStream tarOutput = new TarArchiveOutputStream(output.openOutput())) {
        for (DataSource input : inputs) {
            TarArchiveEntry entry = new TarArchiveEntry(input.getName());
            // tar headers carry the entry size, so it must be set before writing
            entry.setSize(input.getLength());
            tarOutput.putArchiveEntry(entry);
            // buffer is a reusable byte[] field of CommonsTarPacker
            PackerUtils.packEntry(input, tarOutput, buffer);
            tarOutput.closeArchiveEntry();
        }
    }
}
Also used: TarArchiveOutputStream (org.apache.commons.compress.archivers.tar.TarArchiveOutputStream), TarArchiveEntry (org.apache.commons.compress.archivers.tar.TarArchiveEntry)
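
The byte copying itself happens in PackerUtils.packEntry, which is not shown. A minimal sketch of what that helper plausibly does, assuming the entry body arrives as a plain InputStream; the signature here is hypothetical, not Gradle's actual API:

import java.io.IOException;
import java.io.InputStream;
import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;

final class PackerSketch {
    // Copy one entry body into the tar stream in chunks, reusing the
    // caller's buffer. putArchiveEntry/closeArchiveEntry are handled by
    // the caller, as in the pack() method above.
    static void packEntry(InputStream input, TarArchiveOutputStream tarOutput, byte[] buffer) throws IOException {
        int read;
        while ((read = input.read(buffer)) != -1) {
            tarOutput.write(buffer, 0, read);
        }
    }
}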

Example 92 with TarArchiveEntry

Use of org.apache.commons.compress.archivers.tar.TarArchiveEntry in project nutch by apache.

The class CommonCrawlDataDumper, method dump.

/**
 * Dumps the reverse-engineered CBOR content from the provided segment
 * directories. A parent directory containing more than one segment may be
 * passed, or a single segment on its own. If the gzip flag is set, the
 * CBOR content is also gzipped.
 *
 * @param outputDir      the directory to dump the raw content to; this
 *                       directory will be created
 * @param segmentRootDir a directory containing one or more segments
 * @param linkdb         path to the linkdb
 * @param gzip           whether the CBOR content should also be gzipped
 * @param mimeTypes      a string array of MIME types to filter on; everything else is excluded
 * @param epochFilename  if {@code true}, output files will be named using the epoch time (in milliseconds)
 * @param extension      a file extension to use with output documents
 * @param warc           if {@code true}, write in WARC format
 * @throws Exception if any exception occurs
 */
public void dump(File outputDir, File segmentRootDir, File linkdb, boolean gzip, String[] mimeTypes, boolean epochFilename, String extension, boolean warc) throws Exception {
    if (gzip) {
        LOG.info("Gzipping CBOR data has been skipped");
    }
    // total file counts
    Map<String, Integer> typeCounts = new HashMap<>();
    // filtered file counters
    Map<String, Integer> filteredCounts = new HashMap<>();
    Configuration nutchConfig = NutchConfiguration.create();
    Path segmentRootPath = new Path(segmentRootDir.toString());
    FileSystem fs = segmentRootPath.getFileSystem(nutchConfig);
    // get all paths
    List<Path> parts = new ArrayList<>();
    RemoteIterator<LocatedFileStatus> files = fs.listFiles(segmentRootPath, true);
    String partPattern = ".*" + File.separator + Content.DIR_NAME + File.separator + "part-[0-9]{5}" + File.separator + "data";
    while (files.hasNext()) {
        LocatedFileStatus next = files.next();
        if (next.isFile()) {
            Path path = next.getPath();
            if (path.toString().matches(partPattern)) {
                parts.add(path);
            }
        }
    }
    LinkDbReader linkDbReader = null;
    if (linkdb != null) {
        linkDbReader = new LinkDbReader(nutchConfig, new Path(linkdb.toString()));
    }
    if (parts.isEmpty()) {
        LOG.error("No segment directories found in {} ", segmentRootDir.getAbsolutePath());
        System.exit(1);
    }
    LOG.info("Found {} segment parts", parts.size());
    if (gzip && !warc) {
        fileList = new ArrayList<>();
        constructNewStream(outputDir);
    }
    for (Path segmentPart : parts) {
        LOG.info("Processing segment Part : [ {} ]", segmentPart);
        // try-with-resources closes the reader even when a record throws
        try (SequenceFile.Reader reader = new SequenceFile.Reader(nutchConfig,
                SequenceFile.Reader.file(segmentPart))) {
            Writable key = (Writable) reader.getKeyClass().getConstructor().newInstance();
            Content content = null;
            while (reader.next(key)) {
                content = new Content();
                reader.getCurrentValue(content);
                Metadata metadata = content.getMetadata();
                String url = key.toString();
                String baseName = FilenameUtils.getBaseName(url);
                String extensionName = FilenameUtils.getExtension(url);
                if (!extension.isEmpty()) {
                    extensionName = extension;
                } else if ((extensionName == null) || extensionName.isEmpty()) {
                    extensionName = "html";
                }
                String outputFullPath = null;
                String outputRelativePath = null;
                String filename = null;
                String timestamp = null;
                String reverseKey = null;
                if (epochFilename || config.getReverseKey()) {
                    try {
                        long epoch = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss z").parse(getDate(metadata.get("Date"))).getTime();
                        timestamp = String.valueOf(epoch);
                    } catch (ParseException pe) {
                        LOG.warn(pe.getMessage());
                    }
                    reverseKey = reverseUrl(url);
                    config.setReverseKeyValue(reverseKey.replace("/", "_") + "_" + DigestUtils.sha1Hex(url) + "_" + timestamp);
                }
                if (!warc) {
                    if (epochFilename) {
                        outputFullPath = DumpFileUtil.createFileNameFromUrl(outputDir.getAbsolutePath(), reverseKey, url, timestamp, extensionName, !gzip);
                        // relative path is everything up to (not including) the last separator
                        outputRelativePath = outputFullPath.substring(0, outputFullPath.lastIndexOf(File.separator));
                        filename = content.getMetadata().get(Metadata.DATE) + "." + extensionName;
                    } else {
                        String md5Ofurl = DumpFileUtil.getUrlMD5(url);
                        String fullDir = DumpFileUtil.createTwoLevelsDirectory(outputDir.getAbsolutePath(), md5Ofurl, !gzip);
                        filename = DumpFileUtil.createFileName(md5Ofurl, baseName, extensionName);
                        outputFullPath = String.format("%s/%s", fullDir, filename);
                        String[] fullPathLevels = fullDir.split(Pattern.quote(File.separator));
                        String firstLevelDirName = fullPathLevels[fullPathLevels.length - 2];
                        String secondLevelDirName = fullPathLevels[fullPathLevels.length - 1];
                        outputRelativePath = firstLevelDirName + secondLevelDirName;
                    }
                }
                // encode all file types if no MIME types were given
                boolean filter = (mimeTypes == null);
                String jsonData = "";
                try {
                    String mimeType = new Tika().detect(content.getContent());
                    // Maps file to JSON-based structure
                    // there may be duplicates, so using set
                    Set<String> inUrls = null;
                    if (linkDbReader != null) {
                        Inlinks inlinks = linkDbReader.getInlinks((Text) key);
                        if (inlinks != null) {
                            Iterator<Inlink> iterator = inlinks.iterator();
                            inUrls = new LinkedHashSet<>();
                            while (inUrls.size() <= MAX_INLINKS && iterator.hasNext()) {
                                inUrls.add(iterator.next().getFromUrl());
                            }
                        }
                    }
                    // TODO: Make this Jackson Format implementation reusable
                    try (CommonCrawlFormat format = CommonCrawlFormatFactory.getCommonCrawlFormat(warc ? "WARC" : "JACKSON", nutchConfig, config)) {
                        if (inUrls != null) {
                            format.setInLinks(new ArrayList<>(inUrls));
                        }
                        jsonData = format.getJsonData(url, content, metadata);
                    }
                    collectStats(typeCounts, mimeType);
                    // collects statistics for the given mimetypes
                    if ((mimeType != null) && (mimeTypes != null) && Arrays.asList(mimeTypes).contains(mimeType)) {
                        collectStats(filteredCounts, mimeType);
                        filter = true;
                    }
                } catch (IOException ioe) {
                    LOG.error("Fatal error in creating JSON data: " + ioe.getMessage());
                    return;
                }
                if (!warc) {
                    if (filter) {
                        byte[] byteData = serializeCBORData(jsonData);
                        if (!gzip) {
                            File outputFile = new File(outputFullPath);
                            if (outputFile.exists()) {
                                LOG.info("Skipping writing: [" + outputFullPath + "]: file already exists");
                            } else {
                                LOG.info("Writing: [" + outputFullPath + "]");
                                try (FileOutputStream out = new FileOutputStream(outputFile)) {
                                    IOUtils.copy(new ByteArrayInputStream(byteData), out);
                                }
                            }
                        } else {
                            if (fileList.contains(outputFullPath)) {
                                LOG.info("Skipping compressing: [" + outputFullPath + "]: file already exists");
                            } else {
                                fileList.add(outputFullPath);
                                LOG.info("Compressing: [" + outputFullPath + "]");
                                // TarArchiveEntry tarEntry = new TarArchiveEntry(firstLevelDirName + File.separator + secondLevelDirName + File.separator + filename);
                                TarArchiveEntry tarEntry = new TarArchiveEntry(outputRelativePath + File.separator + filename);
                                tarEntry.setSize(byteData.length);
                                tarOutput.putArchiveEntry(tarEntry);
                                tarOutput.write(byteData);
                                tarOutput.closeArchiveEntry();
                            }
                        }
                    }
                }
            }
        } catch (Exception e) {
            LOG.warn("SKIPPED: {} Because : {}", segmentPart, e.getMessage());
        }
    }
    // close the shared FileSystem once, after all segment parts are processed
    fs.close();
    if (gzip && !warc) {
        closeStream();
    }
    if (!typeCounts.isEmpty()) {
        LOG.info("CommonsCrawlDataDumper File Stats: " + DumpFileUtil.displayFileTypes(typeCounts, filteredCounts));
    }
}
Also used: Configuration (org.apache.hadoop.conf.Configuration), NutchConfiguration (org.apache.nutch.util.NutchConfiguration), HashMap (java.util.HashMap), ArrayList (java.util.ArrayList), Metadata (org.apache.nutch.metadata.Metadata), LinkDbReader (org.apache.nutch.crawl.LinkDbReader), Writable (org.apache.hadoop.io.Writable), Tika (org.apache.tika.Tika), SequenceFile (org.apache.hadoop.io.SequenceFile), FileSystem (org.apache.hadoop.fs.FileSystem), Path (org.apache.hadoop.fs.Path), LocatedFileStatus (org.apache.hadoop.fs.LocatedFileStatus), Inlinks (org.apache.nutch.crawl.Inlinks), IOException (java.io.IOException), Inlink (org.apache.nutch.crawl.Inlink), TarArchiveEntry (org.apache.commons.compress.archivers.tar.TarArchiveEntry), ParseException (java.text.ParseException), MalformedURLException (java.net.MalformedURLException), ByteArrayInputStream (java.io.ByteArrayInputStream), Content (org.apache.nutch.protocol.Content), FileOutputStream (java.io.FileOutputStream), SimpleDateFormat (com.ibm.icu.text.SimpleDateFormat), File (java.io.File)
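
The gzip path above writes through a tarOutput field that constructNewStream(outputDir) initializes elsewhere in the class. A minimal sketch of what such a setup could look like, assuming a .tar.gz file in the output directory; the file naming and the helper itself are hypothetical, not Nutch's actual code:

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream;

final class TarGzStreamSketch {
    // Layer a tar writer over a gzip compressor over a buffered file stream.
    static TarArchiveOutputStream openTarGz(File outputDir, String baseName) throws IOException {
        File archive = new File(outputDir, baseName + ".tar.gz");
        TarArchiveOutputStream tarOutput = new TarArchiveOutputStream(
                new GzipCompressorOutputStream(
                        new BufferedOutputStream(new FileOutputStream(archive))));
        // entry names built from reversed URLs and hashes easily exceed the
        // 100-character classic tar limit, so enable POSIX long-name handling
        tarOutput.setLongFileMode(TarArchiveOutputStream.LONGFILE_POSIX);
        return tarOutput;
    }
}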

Example 93 with TarArchiveEntry

Use of org.apache.commons.compress.archivers.tar.TarArchiveEntry in project winery by eclipse.

The class ScriptPlugin, method addFilesToTarGZ.

private static void addFilesToTarGZ(final File file, final TarArchiveOutputStream tos) throws IOException {
    // add an archive entry for this file or directory, named after the file itself
    tos.putArchiveEntry(new TarArchiveEntry(file, file.getName()));
    File[] files = file.listFiles();
    if (file.isDirectory() && files != null) {
        // no need to copy any content since it is a directory, just close the output stream
        tos.closeArchiveEntry();
        for (final File cFile : files) {
            // recursively call the method for all the subfolders
            addFilesToTarGZ(cFile, tos);
        }
    } else {
        try (FileInputStream fis = new FileInputStream(file);
            BufferedInputStream bis = new BufferedInputStream(fis)) {
            IOUtils.copy(bis, tos);
            tos.closeArchiveEntry();
        }
    }
}
Also used: BufferedInputStream (java.io.BufferedInputStream), File (java.io.File), TarArchiveEntry (org.apache.commons.compress.archivers.tar.TarArchiveEntry), FileInputStream (java.io.FileInputStream)
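
Note that every entry is named with file.getName() alone, so nested files all land at the archive root; callers that need the hierarchy preserved would pass a relative path instead. Despite the method's name, the gzip layer is the caller's responsibility. A minimal usage sketch, assuming addFilesToTarGZ is accessible (e.g., from within ScriptPlugin); paths are placeholders:

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream;

final class TarGzUsageSketch {
    // Wrap the target file in gzip + tar layers, then hand the stream to
    // the recursive method shown above; closing the tar stream finishes
    // the archive and flushes the gzip trailer.
    static void archiveDirectory(File sourceDir, File targetArchive) throws IOException {
        try (TarArchiveOutputStream tos = new TarArchiveOutputStream(
                new GzipCompressorOutputStream(
                        new BufferedOutputStream(new FileOutputStream(targetArchive))))) {
            addFilesToTarGZ(sourceDir, tos); // the ScriptPlugin method above
        }
    }
}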

Example 94 with TarArchiveEntry

Use of org.apache.commons.compress.archivers.tar.TarArchiveEntry in project java by kubernetes-client.

The class Copy, method copyFileToPodAsync.

public Future<Integer> copyFileToPodAsync(String namespace, String pod, String container, byte[] src, Path destPath) throws ApiException, IOException {
    // Run decoding and extracting processes
    final Process proc = execCopyToPod(namespace, pod, container, destPath);
    try (ArchiveOutputStream archiveOutputStream = new TarArchiveOutputStream(proc.getOutputStream())) {
        TarArchiveEntry tarEntry = new TarArchiveEntry(new File(destPath.getFileName().toString()));
        tarEntry.setSize(src.length);
        archiveOutputStream.putArchiveEntry(tarEntry);
        Streams.copy(new ByteArrayInputStream(src), archiveOutputStream);
        archiveOutputStream.closeArchiveEntry();
        return new ProcessFuture(proc);
    }
}
Also used: ByteArrayInputStream (java.io.ByteArrayInputStream), ArchiveEntry (org.apache.commons.compress.archivers.ArchiveEntry), TarArchiveEntry (org.apache.commons.compress.archivers.tar.TarArchiveEntry), TarArchiveOutputStream (org.apache.commons.compress.archivers.tar.TarArchiveOutputStream), File (java.io.File), ArchiveOutputStream (org.apache.commons.compress.archivers.ArchiveOutputStream)

Example 95 with TarArchiveEntry

Use of org.apache.commons.compress.archivers.tar.TarArchiveEntry in project java by kubernetes-client.

The class Copy, method copyFileToPodAsync (file-based variant).

public Future<Integer> copyFileToPodAsync(String namespace, String pod, String container, Path srcPath, Path destPath) throws ApiException, IOException {
    // Run decoding and extracting processes
    final Process proc = execCopyToPod(namespace, pod, container, destPath);
    // Send encoded archive output stream
    File srcFile = new File(srcPath.toUri());
    try (ArchiveOutputStream archiveOutputStream = new TarArchiveOutputStream(proc.getOutputStream());
        FileInputStream input = new FileInputStream(srcFile)) {
        ArchiveEntry tarEntry = new TarArchiveEntry(srcFile, destPath.getFileName().toString());
        archiveOutputStream.putArchiveEntry(tarEntry);
        Streams.copy(input, archiveOutputStream);
        archiveOutputStream.closeArchiveEntry();
        return new ProcessFuture(proc);
    }
}
Also used: ArchiveEntry (org.apache.commons.compress.archivers.ArchiveEntry), TarArchiveEntry (org.apache.commons.compress.archivers.tar.TarArchiveEntry), TarArchiveOutputStream (org.apache.commons.compress.archivers.tar.TarArchiveOutputStream), File (java.io.File), ArchiveOutputStream (org.apache.commons.compress.archivers.ArchiveOutputStream), FileInputStream (java.io.FileInputStream)
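
In both variants the returned Future resolves with the exit code of the exec process that extracts the tar stream inside the pod. A hedged usage sketch of the file-based variant, assuming the surrounding Copy class from the snippets above and a configured ApiClient; the pod coordinates and paths are placeholders:

import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.concurrent.Future;

final class CopyUsageSketch {
    // Push a local file into a running container and wait for the
    // remote extraction to finish.
    static void copyIntoPod(Copy copy) throws Exception {
        Path src = Paths.get("local.txt");
        Path dest = Paths.get("/tmp/remote.txt");
        Future<Integer> result = copy.copyFileToPodAsync("default", "my-pod", "app", src, dest);
        int exitCode = result.get(); // blocks until the exec process in the pod exits
    }
}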

Aggregations

Classes most often used together with TarArchiveEntry, with usage counts:

TarArchiveEntry (org.apache.commons.compress.archivers.tar.TarArchiveEntry): 213
TarArchiveInputStream (org.apache.commons.compress.archivers.tar.TarArchiveInputStream): 102
File (java.io.File): 91
FileInputStream (java.io.FileInputStream): 59
IOException (java.io.IOException): 59
FileOutputStream (java.io.FileOutputStream): 46
GzipCompressorInputStream (org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream): 40
InputStream (java.io.InputStream): 32
TarArchiveOutputStream (org.apache.commons.compress.archivers.tar.TarArchiveOutputStream): 32
BufferedInputStream (java.io.BufferedInputStream): 31
ByteArrayInputStream (java.io.ByteArrayInputStream): 28
ByteArrayOutputStream (java.io.ByteArrayOutputStream): 24
Test (org.junit.Test): 24
Path (java.nio.file.Path): 21
BufferedOutputStream (java.io.BufferedOutputStream): 20
OutputStream (java.io.OutputStream): 18
ArrayList (java.util.ArrayList): 18
ArchiveStreamFactory (org.apache.commons.compress.archivers.ArchiveStreamFactory): 16
HashMap (java.util.HashMap): 12
GZIPInputStream (java.util.zip.GZIPInputStream): 12
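
The top co-occurrences are the reading-side classes (TarArchiveInputStream, GzipCompressorInputStream), which the examples above do not show. For completeness, a minimal sketch of the reader-side counterpart that lists the entries of a .tar.gz archive; the path is a placeholder:

import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;

final class TarListSketch {
    // Walk a .tar.gz archive entry by entry; after getNextTarEntry()
    // returns, the stream is positioned at that entry's body.
    static void listEntries(String archivePath) throws IOException {
        try (TarArchiveInputStream tin = new TarArchiveInputStream(
                new GzipCompressorInputStream(
                        new BufferedInputStream(new FileInputStream(archivePath))))) {
            TarArchiveEntry entry;
            while ((entry = tin.getNextTarEntry()) != null) {
                System.out.printf("%s (%d bytes)%n", entry.getName(), entry.getSize());
            }
        }
    }
}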