Search in sources :

Example 1 with ExternalFile

use of org.apache.asterix.external.indexing.ExternalFile in project asterixdb by apache.

the class ExternalIndexingOperations method getSnapshotFromExternalFileSystem.

/**
 * Lists the files that currently back an external dataset.
 *
 * The dataset's {@code ExternalDataConstants.KEY_PATH} property is split on commas; every
 * resulting path is listed on the file system and each returned status is passed to
 * {@code handleFile}, which is given the current size of the result list as the next
 * file number.
 *
 * @param dataset
 *            the dataset whose details are an {@link ExternalDatasetDetails}
 * @return the non-empty list of files collected by {@code handleFile}
 * @throws AlgebricksException
 *             if anything fails while talking to the file system, or if no file was found;
 *             the original failure is attached as the cause
 */
public static List<ExternalFile> getSnapshotFromExternalFileSystem(Dataset dataset) throws AlgebricksException {
    ArrayList<ExternalFile> files = new ArrayList<>();
    ExternalDatasetDetails datasetDetails = (ExternalDatasetDetails) dataset.getDatasetDetails();
    // try-with-resources guarantees the file system handle is closed even when listing
    // or handleFile throws; previously close() was only reached on the success path.
    try (FileSystem fs = getFileSystemObject(datasetDetails.getProperties())) {
        // Get paths of dataset
        String path = datasetDetails.getProperties().get(ExternalDataConstants.KEY_PATH);
        String[] paths = path.split(",");
        // Add fileStatuses to files
        for (String aPath : paths) {
            for (FileStatus fileStatus : fs.listStatus(new Path(aPath))) {
                int nextFileNumber = files.size();
                handleFile(dataset, files, fs, fileStatus, nextFileNumber);
            }
        }
        if (files.isEmpty()) {
            throw new AlgebricksException("File Snapshot retrieved from external file system is empty");
        }
        return files;
    } catch (Exception e) {
        LOGGER.log(Level.WARNING, "Exception while trying to get snapshot from external system", e);
        // Keep the historical message text but also chain the cause so the stack trace survives.
        throw new AlgebricksException("Unable to get list of HDFS files " + e, e);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) ArrayList(java.util.ArrayList) AlgebricksException(org.apache.hyracks.algebricks.common.exceptions.AlgebricksException) ExternalFile(org.apache.asterix.external.indexing.ExternalFile) AlgebricksPartitionConstraint(org.apache.hyracks.algebricks.common.constraints.AlgebricksPartitionConstraint) AlgebricksException(org.apache.hyracks.algebricks.common.exceptions.AlgebricksException) HyracksDataException(org.apache.hyracks.api.exceptions.HyracksDataException) IOException(java.io.IOException) ExternalDatasetDetails(org.apache.asterix.metadata.entities.ExternalDatasetDetails) FileSystem(org.apache.hadoop.fs.FileSystem) DistributedFileSystem(org.apache.hadoop.hdfs.DistributedFileSystem)

Example 2 with ExternalFile

use of org.apache.asterix.external.indexing.ExternalFile in project asterixdb by apache.

the class ExternalIndexingOperations method isDatasetUptodate.

/**
 * Compares the files recorded in metadata with a fresh snapshot of the external file
 * system and sorts the differences into the supplied lists.
 *
 * When the method returns:
 * <ul>
 * <li>{@code metadataFiles} no longer contains entries whose name, modification time and
 * size all match the snapshot; the entries left behind are marked {@code APPEND_OP}
 * (same time, different size) or {@code DROP_OP} (replaced or missing);</li>
 * <li>{@code addedFiles} holds snapshot files that are new or whose modification time
 * changed, numbered consecutively starting after the file number of the last element of
 * {@code metadataFiles};</li>
 * <li>{@code deletedFiles} holds copies of metadata entries that were replaced or are no
 * longer present, marked {@code DROP_OP};</li>
 * <li>{@code appendedFiles} holds the snapshot version of files whose size changed while
 * the modification time stayed the same.</li>
 * </ul>
 * Deleted and appended entries receive file numbers after all added files.
 *
 * @param dataset
 *            the external dataset to inspect
 * @param metadataFiles
 *            files currently recorded in metadata; its last element is read up front, so
 *            the list must not be empty
 * @param addedFiles
 *            output list for files to add
 * @param deletedFiles
 *            output list for files to drop
 * @param appendedFiles
 *            output list for files that grew
 * @return {@code true} when no difference was found, {@code false} otherwise
 * @throws AlgebricksException
 *             if the file system snapshot cannot be obtained
 */
public static boolean isDatasetUptodate(Dataset dataset, List<ExternalFile> metadataFiles, List<ExternalFile> addedFiles, List<ExternalFile> deletedFiles, List<ExternalFile> appendedFiles) throws AlgebricksException {
    // New numbers continue right after the last recorded file.
    int nextNumber = metadataFiles.get(metadataFiles.size() - 1).getFileNumber() + 1;
    boolean unchanged = true;

    for (ExternalFile current : getSnapshotFromExternalFileSystem(dataset)) {
        // Locate the first metadata entry that carries the same file name.
        ExternalFile recorded = null;
        Iterator<ExternalFile> cursor = metadataFiles.iterator();
        while (recorded == null && cursor.hasNext()) {
            ExternalFile candidate = cursor.next();
            if (current.getFileName().equals(candidate.getFileName())) {
                recorded = candidate;
            }
        }

        if (recorded != null && current.getLastModefiedTime().equals(recorded.getLastModefiedTime())) {
            if (current.getSize() == recorded.getSize()) {
                // Identical name, timestamp and size: nothing to do, forget the entry.
                cursor.remove();
            } else {
                // Same timestamp but a different size is treated as an append.
                recorded.setPendingOp(ExternalFilePendingOp.APPEND_OP);
                current.setPendingOp(ExternalFilePendingOp.APPEND_OP);
                appendedFiles.add(current);
                unchanged = false;
            }
            continue;
        }

        if (recorded != null) {
            // Same name with another modification date: drop the old version first.
            // The placeholder number 0 is replaced in the renumbering pass below.
            recorded.setPendingOp(ExternalFilePendingOp.DROP_OP);
            deletedFiles.add(new ExternalFile(recorded.getDataverseName(), recorded.getDatasetName(), 0, recorded.getFileName(), recorded.getLastModefiedTime(), recorded.getSize(), ExternalFilePendingOp.DROP_OP));
        }
        // Either a brand new file or the replacement of a dropped one: pending add.
        current.setPendingOp(ExternalFilePendingOp.ADD_OP);
        current.setFileNumber(nextNumber++);
        addedFiles.add(current);
        unchanged = false;
    }

    // Deleted and appended files are numbered after every added file.
    for (ExternalFile dropped : deletedFiles) {
        dropped.setFileNumber(nextNumber++);
    }
    for (ExternalFile grown : appendedFiles) {
        grown.setFileNumber(nextNumber++);
    }

    // Whatever is still untouched in metadata no longer exists on the file system.
    for (ExternalFile leftover : metadataFiles) {
        if (leftover.getPendingOp() != ExternalFilePendingOp.NO_OP) {
            continue;
        }
        leftover.setPendingOp(ExternalFilePendingOp.DROP_OP);
        deletedFiles.add(new ExternalFile(leftover.getDataverseName(), leftover.getDatasetName(), nextNumber, leftover.getFileName(), leftover.getLastModefiedTime(), leftover.getSize(), leftover.getPendingOp()));
        nextNumber++;
        unchanged = false;
    }
    return unchanged;
}
Also used : AlgebricksPartitionConstraint(org.apache.hyracks.algebricks.common.constraints.AlgebricksPartitionConstraint) ExternalFile(org.apache.asterix.external.indexing.ExternalFile)

Example 3 with ExternalFile

use of org.apache.asterix.external.indexing.ExternalFile in project asterixdb by apache.

the class ExternalIndexingOperations method listSubFiles.

/*
 * Recursively collects every non-directory entry found under src into files.
 * src is expected to be a folder. Each collected file is numbered with the size
 * of the list at the moment it is appended.
 */
private static void listSubFiles(Dataset dataset, FileSystem srcFs, FileStatus src, List<ExternalFile> files) throws IOException {
    for (FileStatus entry : srcFs.listStatus(src.getPath())) {
        if (entry.isDirectory()) {
            // Descend into sub-folders; they contribute files but are not listed themselves.
            listSubFiles(dataset, srcFs, entry, files);
            continue;
        }
        String entryPath = entry.getPath().toUri().getPath();
        Date modifiedAt = new Date(entry.getModificationTime());
        files.add(new ExternalFile(dataset.getDataverseName(), dataset.getDatasetName(), files.size(), entryPath, modifiedAt, entry.getLen(), ExternalFilePendingOp.NO_OP));
    }
}
Also used : Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) AlgebricksPartitionConstraint(org.apache.hyracks.algebricks.common.constraints.AlgebricksPartitionConstraint) ExternalFile(org.apache.asterix.external.indexing.ExternalFile) Date(java.util.Date)

Example 4 with ExternalFile

use of org.apache.asterix.external.indexing.ExternalFile in project asterixdb by apache.

the class ExternalIndexingOperations method buildIndexUpdateOp.

/**
 * Builds the job that loads a secondary index over the combined file list.
 *
 * The list passed to {@code IndexUtil.buildSecondaryIndexLoadingJobSpec} consists of all
 * {@code metadataFiles}, then {@code addedFiles}, then {@code appendedFiles}. Metadata
 * files that are marked {@code APPEND_OP} are reset to {@code NO_OP} before being added.
 *
 * @param ds
 *            the dataset owning the index
 * @param index
 *            the index to load
 * @param metadataFiles
 *            files already recorded in metadata
 * @param addedFiles
 *            newly added files
 * @param appendedFiles
 *            new versions of files that were appended to
 * @param metadataProvider
 *            the metadata provider handed to the job builder
 * @return the job specification produced by {@code IndexUtil}
 * @throws AlgebricksException
 *             declared by this method; propagated from the job builder
 */
public static JobSpecification buildIndexUpdateOp(Dataset ds, Index index, List<ExternalFile> metadataFiles, List<ExternalFile> addedFiles, List<ExternalFile> appendedFiles, MetadataProvider metadataProvider) throws AlgebricksException {
    ArrayList<ExternalFile> allFiles = new ArrayList<>();
    for (ExternalFile recorded : metadataFiles) {
        // Every metadata file is included; an appended one is first reset to NO_OP.
        if (recorded.getPendingOp() == ExternalFilePendingOp.APPEND_OP) {
            recorded.setPendingOp(ExternalFilePendingOp.NO_OP);
        }
        allFiles.add(recorded);
    }
    // New files follow, then the appended versions.
    allFiles.addAll(addedFiles);
    allFiles.addAll(appendedFiles);
    return IndexUtil.buildSecondaryIndexLoadingJobSpec(ds, index, metadataProvider, allFiles);
}
Also used : ArrayList(java.util.ArrayList) ExternalFile(org.apache.asterix.external.indexing.ExternalFile)

Example 5 with ExternalFile

use of org.apache.asterix.external.indexing.ExternalFile in project asterixdb by apache.

the class ExternalFileTupleTranslator method createExternalFileFromARecord.

/**
 * Rebuilds an {@link ExternalFile} from its metadata record.
 *
 * Each field is read by position using the {@code MetadataRecordTypes.EXTERNAL_FILE_ARECORD_*}
 * index constants. The pending operation is stored as an int and is used as an index into
 * {@code ExternalFilePendingOp.values()}.
 *
 * @param externalFileRecord
 *            the record holding the external file fields
 * @return the reconstructed external file
 */
private ExternalFile createExternalFileFromARecord(ARecord externalFileRecord) {
    AString dataverseField = (AString) externalFileRecord.getValueByPos(MetadataRecordTypes.EXTERNAL_FILE_ARECORD_DATAVERSENAME_FIELD_INDEX);
    String dataverse = dataverseField.getStringValue();

    AString datasetField = (AString) externalFileRecord.getValueByPos(MetadataRecordTypes.EXTERNAL_FILE_ARECORD_DATASET_NAME_FIELD_INDEX);
    String datasetName = datasetField.getStringValue();

    AInt32 numberField = (AInt32) externalFileRecord.getValueByPos(MetadataRecordTypes.EXTERNAL_FILE_ARECORD_FILE_NUMBER_FIELD_INDEX);
    int number = numberField.getIntegerValue();

    AString nameField = (AString) externalFileRecord.getValueByPos(MetadataRecordTypes.EXTERNAL_FILE_ARECORD_FILE_NAME_FIELD_INDEX);
    String name = nameField.getStringValue();

    AInt64 sizeField = (AInt64) externalFileRecord.getValueByPos(MetadataRecordTypes.EXTERNAL_FILE_ARECORD_FILE_SIZE_FIELD_INDEX);
    long size = sizeField.getLongValue();

    ADateTime modDateField = (ADateTime) externalFileRecord.getValueByPos(MetadataRecordTypes.EXTERNAL_FILE_ARECORD_FILE_MOD_DATE_FIELD_INDEX);
    Date lastModified = new Date(modDateField.getChrononTime());

    // The record stores the pending op as its position in the enum's values() array.
    AInt32 pendingOpField = (AInt32) externalFileRecord.getValueByPos(MetadataRecordTypes.EXTERNAL_FILE_ARECORD_FILE_PENDING_OP_FIELD_INDEX);
    ExternalFilePendingOp pendingOp = ExternalFilePendingOp.values()[pendingOpField.getIntegerValue()];

    return new ExternalFile(dataverse, datasetName, number, name, lastModified, size, pendingOp);
}
Also used : ExternalFilePendingOp(org.apache.asterix.common.config.DatasetConfig.ExternalFilePendingOp) AString(org.apache.asterix.om.base.AString) AString(org.apache.asterix.om.base.AString) AInt32(org.apache.asterix.om.base.AInt32) Date(java.util.Date) ExternalFile(org.apache.asterix.external.indexing.ExternalFile) AInt64(org.apache.asterix.om.base.AInt64)

Aggregations

ExternalFile (org.apache.asterix.external.indexing.ExternalFile)17 ArrayList (java.util.ArrayList)7 HyracksDataException (org.apache.hyracks.api.exceptions.HyracksDataException)7 Index (org.apache.asterix.metadata.entities.Index)5 IOException (java.io.IOException)4 Date (java.util.Date)4 Dataset (org.apache.asterix.metadata.entities.Dataset)4 ExternalDatasetDetails (org.apache.asterix.metadata.entities.ExternalDatasetDetails)4 AlgebricksPartitionConstraint (org.apache.hyracks.algebricks.common.constraints.AlgebricksPartitionConstraint)4 AlgebricksException (org.apache.hyracks.algebricks.common.exceptions.AlgebricksException)4 ACIDException (org.apache.asterix.common.exceptions.ACIDException)3 AsterixException (org.apache.asterix.common.exceptions.AsterixException)3 CompilationException (org.apache.asterix.common.exceptions.CompilationException)3 MetadataException (org.apache.asterix.metadata.MetadataException)3 FileStatus (org.apache.hadoop.fs.FileStatus)3 Path (org.apache.hadoop.fs.Path)3 JobSpecification (org.apache.hyracks.api.job.JobSpecification)3 IIndex (org.apache.hyracks.storage.common.IIndex)3 RemoteException (java.rmi.RemoteException)2 List (java.util.List)2