Search in sources :

Example 1 with FSFileInfo

use of bio.terra.service.filedata.FSFileInfo in project jade-data-repo by DataBiosphere.

the class IngestDriverStep method getLoadCandidates.

/**
 * Fetches the load candidates for a bulk load and reconciles the "running" entries with
 * the flight states reported by Stairway.
 *
 * <p>The database view is read first. Each load the database lists as running is then
 * looked up in Stairway: flights that are still in progress stay in the running list,
 * flights that ended in error are recorded as failed (and counted), and flights that
 * succeeded are recorded as succeeded using the file id and file info from their result
 * map. The returned candidates carry the adjusted failure count and running list, so
 * they reflect the state "right now" (more or less).
 *
 * @param context flight context used to reach Stairway
 * @param loadId id of the bulk load being driven
 * @param concurrentLoads passed through to {@code loadService.findCandidates}
 * @return the candidates with the failure count and running loads brought up to date
 * @throws DatabaseOperationException declared by the signature; which call raises it is
 *     not visible from this method
 * @throws InterruptedException declared by the signature; which call raises it is not
 *     visible from this method
 */
private LoadCandidates getLoadCandidates(FlightContext context, UUID loadId, int concurrentLoads) throws DatabaseOperationException, InterruptedException {
    LoadCandidates candidates = loadService.findCandidates(loadId, concurrentLoads);
    logger.debug("Candidates from db: failedLoads={}  runningLoads={}  candidateFiles={}", candidates.getFailedLoads(), candidates.getRunningLoads().size(), candidates.getCandidateFiles().size());

    int failedSoFar = candidates.getFailedLoads();
    List<LoadFile> stillRunning = new LinkedList<>();

    for (LoadFile running : candidates.getRunningLoads()) {
        FlightState state = context.getStairway().getFlightState(running.getFlightId());
        switch (state.getFlightStatus()) {
            case RUNNING:
            case WAITING:
            case READY:
            case QUEUED:
                // The flight has not finished yet, so it remains in the running list.
                stillRunning.add(running);
                break;

            case ERROR:
            case FATAL: {
                // Record the failure with whatever exception text the flight left behind.
                String failureText = state.getException().isPresent()
                    ? state.getException().get().toString()
                    : "unknown error";
                loadService.setLoadFileFailed(loadId, running.getTargetPath(), failureText);
                failedSoFar++;
                break;
            }

            case SUCCESS: {
                // A successful flight must have left a result map holding the file details.
                FlightMap results = state.getResultMap()
                    .orElseThrow(() -> new FileSystemCorruptException("no result map in flight state"));
                String loadedFileId = results.get(FileMapKeys.FILE_ID, String.class);
                FSFileInfo loadedFileInfo = results.get(FileMapKeys.FILE_INFO, FSFileInfo.class);
                loadService.setLoadFileSucceeded(loadId, running.getTargetPath(), loadedFileId, loadedFileInfo);
                break;
            }
        }
    }

    candidates.failedLoads(failedSoFar).runningLoads(stillRunning);
    logger.debug("Candidates resolved: failedLoads={}  runningLoads={}  candidateFiles={}", candidates.getFailedLoads(), candidates.getRunningLoads().size(), candidates.getCandidateFiles().size());
    return candidates;
}
Also used : FlightState(bio.terra.stairway.FlightState) FSFileInfo(bio.terra.service.filedata.FSFileInfo) LoadCandidates(bio.terra.service.load.LoadCandidates) FileSystemCorruptException(bio.terra.service.filedata.exception.FileSystemCorruptException) LoadFile(bio.terra.service.load.LoadFile) FlightMap(bio.terra.stairway.FlightMap) LinkedList(java.util.LinkedList)

Example 2 with FSFileInfo

use of bio.terra.service.filedata.FSFileInfo in project jade-data-repo by DataBiosphere.

the class LoadDaoUnitTest method loadFilesTest.

/**
 * Walks a load of eight files through several rounds of {@code findCandidates},
 * changing file states between rounds and checking the counts after each one.
 *
 * <p>NOTE(review): the three numeric arguments to {@code testLoadCandidates} look like
 * the expected failed, running and candidate counts, in that order - confirm against
 * the helper.
 */
@Test
public void loadFilesTest() throws Exception {
    UUID loadId = populateFiles(8);
    FSFileInfo checksums = new FSFileInfo().checksumCrc32c("crcChecksum").checksumMd5("md5Checksum");

    // Round 1: nothing has been touched yet.
    LoadCandidates round1 = loadDao.findCandidates(loadId, 3);
    testLoadCandidates(round1, 0, 0, 3);
    List<LoadFile> firstBatch = round1.getCandidateFiles();
    // Put the first batch into three different states: succeeded, failed, running.
    loadDao.setLoadFileSucceeded(loadId, firstBatch.get(0).getTargetPath(), "fileidA", checksums);
    loadDao.setLoadFileFailed(loadId, firstBatch.get(1).getTargetPath(), "failureB");
    loadDao.setLoadFileRunning(loadId, firstBatch.get(2).getTargetPath(), FlightIdsUsedByTest.FLIGHT_C.getId());

    // Round 2: finish the file left running and start the whole second batch.
    LoadCandidates round2 = loadDao.findCandidates(loadId, 3);
    testLoadCandidates(round2, 1, 1, 3);
    List<LoadFile> secondBatch = round2.getCandidateFiles();
    loadDao.setLoadFileSucceeded(loadId, firstBatch.get(2).getTargetPath(), "fileidC", checksums);
    loadDao.setLoadFileRunning(loadId, secondBatch.get(0).getTargetPath(), FlightIdsUsedByTest.FLIGHT_D.getId());
    loadDao.setLoadFileRunning(loadId, secondBatch.get(1).getTargetPath(), FlightIdsUsedByTest.FLIGHT_E.getId());
    loadDao.setLoadFileRunning(loadId, secondBatch.get(2).getTargetPath(), FlightIdsUsedByTest.FLIGHT_F.getId());

    // Round 3: fail the entire second batch and start the last two files.
    LoadCandidates round3 = loadDao.findCandidates(loadId, 3);
    testLoadCandidates(round3, 1, 3, 2);
    List<LoadFile> thirdBatch = round3.getCandidateFiles();
    loadDao.setLoadFileFailed(loadId, secondBatch.get(0).getTargetPath(), "errorD");
    loadDao.setLoadFileFailed(loadId, secondBatch.get(1).getTargetPath(), "errorE");
    loadDao.setLoadFileFailed(loadId, secondBatch.get(2).getTargetPath(), "errorF");
    loadDao.setLoadFileRunning(loadId, thirdBatch.get(0).getTargetPath(), FlightIdsUsedByTest.FLIGHT_G.getId());
    loadDao.setLoadFileRunning(loadId, thirdBatch.get(1).getTargetPath(), FlightIdsUsedByTest.FLIGHT_H.getId());

    // Round 4: no candidates remain, but two files are still running.
    LoadCandidates round4 = loadDao.findCandidates(loadId, 3);
    testLoadCandidates(round4, 4, 2, 0);
    loadDao.setLoadFileSucceeded(loadId, thirdBatch.get(0).getTargetPath(), "fileidG", checksums);
    loadDao.setLoadFileSucceeded(loadId, thirdBatch.get(1).getTargetPath(), "fileidH", checksums);

    // Round 5: nothing left to start and nothing running - the bulk-load-completed state.
    LoadCandidates round5 = loadDao.findCandidates(loadId, 3);
    testLoadCandidates(round5, 4, 0, 0);

    // Clean up after ourselves and verify that nothing at all is found afterwards.
    loadDao.cleanFiles(loadId);
    LoadCandidates afterCleanup = loadDao.findCandidates(loadId, 3);
    testLoadCandidates(afterCleanup, 0, 0, 0);
}
Also used : FSFileInfo(bio.terra.service.filedata.FSFileInfo) UUID(java.util.UUID) Test(org.junit.Test) SpringBootTest(org.springframework.boot.test.context.SpringBootTest)

Example 3 with FSFileInfo

use of bio.terra.service.filedata.FSFileInfo in project jade-data-repo by DataBiosphere.

the class IngestFileFileStep method doStep.

/**
 * Writes the file metadata entry for a newly ingested file, unless the working map
 * already marks the load as completed.
 *
 * <p>The entry is assembled from the file id and {@code FSFileInfo} stored in the
 * working map plus the mime type, description and load tag of the request. After the
 * metadata is created, the item is read back and its file model is stored in the
 * working map as the job response.
 *
 * @param context flight context supplying the input parameters and the working map
 * @return success, or a retryable failure when the file DAO reports an aborted
 *     transaction
 */
@Override
public StepResult doStep(FlightContext context) {
    FlightMap workingMap = context.getWorkingMap();
    Boolean loadComplete = workingMap.get(FileMapKeys.LOAD_COMPLETED, Boolean.class);
    if (Boolean.TRUE.equals(loadComplete)) {
        // The load is already flagged as done, so there is nothing to write.
        return StepResult.getStepResultSuccess();
    }

    FlightMap inputParameters = context.getInputParameters();
    FileLoadModel request = inputParameters.get(JobMapKeys.REQUEST.getKeyName(), FileLoadModel.class);
    FSFileInfo info = workingMap.get(FileMapKeys.FILE_INFO, FSFileInfo.class);
    String fileId = workingMap.get(FileMapKeys.FILE_ID, String.class);

    FireStoreFile metadata = new FireStoreFile()
        .fileId(fileId)
        .mimeType(request.getMimeType())
        .description(request.getDescription())
        .bucketResourceId(info.getBucketResourceId())
        .fileCreatedDate(info.getCreatedDate())
        .gspath(info.getGspath())
        .checksumCrc32c(info.getChecksumCrc32c())
        .checksumMd5(info.getChecksumMd5())
        .size(info.getSize())
        .loadTag(request.getLoadTag());

    try {
        fileDao.createFileMetadata(dataset, metadata);
        // Read the item back so the response is built from the complete FSItem.
        FSItem created = fileDao.retrieveById(dataset, fileId, 1, true);
        workingMap.put(JobMapKeys.RESPONSE.getKeyName(), fileService.fileModelFromFSItem(created));
    } catch (FileSystemAbortTransactionException abortEx) {
        return new StepResult(StepStatus.STEP_RESULT_FAILURE_RETRY, abortEx);
    }
    return StepResult.getStepResultSuccess();
}
Also used : FireStoreFile(bio.terra.service.filedata.google.firestore.FireStoreFile) FSFileInfo(bio.terra.service.filedata.FSFileInfo) FSItem(bio.terra.service.filedata.FSItem) FlightMap(bio.terra.stairway.FlightMap) FileLoadModel(bio.terra.model.FileLoadModel) FileSystemAbortTransactionException(bio.terra.service.filedata.exception.FileSystemAbortTransactionException) StepResult(bio.terra.stairway.StepResult)

Example 4 with FSFileInfo

use of bio.terra.service.filedata.FSFileInfo in project jade-data-repo by DataBiosphere.

the class IngestFilePrimaryDataStep method doStep.

/**
 * Copies the file being ingested into its bucket and stores the resulting
 * {@code FSFileInfo} in the working map, unless the working map already marks the load
 * as completed.
 *
 * <p>When the {@code LOAD_SKIP_FILE_LOAD} fault is switched on in the config service,
 * no copy is made and a placeholder {@code FSFileInfo} with fixed values is stored
 * instead.
 *
 * @param context flight context supplying the input parameters and the working map
 * @return always the success result; failures surface as exceptions from the calls made
 */
@Override
public StepResult doStep(FlightContext context) {
    FlightMap inputParameters = context.getInputParameters();
    FileLoadModel fileLoadModel = inputParameters.get(JobMapKeys.REQUEST.getKeyName(), FileLoadModel.class);
    FlightMap workingMap = context.getWorkingMap();
    String fileId = workingMap.get(FileMapKeys.FILE_ID, String.class);
    Boolean loadComplete = workingMap.get(FileMapKeys.LOAD_COMPLETED, Boolean.class);
    if (Boolean.TRUE.equals(loadComplete)) {
        // The load is already flagged as done, so no copy is needed.
        return StepResult.getStepResultSuccess();
    }

    // The bucket has already been chosen for this file. A bulk load passes it in the
    // input parameters; a single file load keeps it in the working map.
    GoogleBucketResource targetBucket = inputParameters.get(FileMapKeys.BUCKET_INFO, GoogleBucketResource.class);
    if (targetBucket == null) {
        targetBucket = workingMap.get(FileMapKeys.BUCKET_INFO, GoogleBucketResource.class);
    }

    FSFileInfo copiedInfo = configService.testInsertFault(ConfigEnum.LOAD_SKIP_FILE_LOAD)
        // Fault injection: skip the real copy and fabricate the file info.
        ? new FSFileInfo()
            .fileId(fileId)
            .bucketResourceId(targetBucket.getResourceId().toString())
            .checksumCrc32c(null)
            .checksumMd5("baaaaaad")
            .createdDate(Instant.now().toString())
            .gspath("gs://path")
            .size(100L)
        : gcsPdao.copyFile(dataset, fileLoadModel, fileId, targetBucket);

    workingMap.put(FileMapKeys.FILE_INFO, copiedInfo);
    return StepResult.getStepResultSuccess();
}
Also used : GoogleBucketResource(bio.terra.service.resourcemanagement.google.GoogleBucketResource) FSFileInfo(bio.terra.service.filedata.FSFileInfo) FlightMap(bio.terra.stairway.FlightMap) FileLoadModel(bio.terra.model.FileLoadModel)

Example 5 with FSFileInfo

use of bio.terra.service.filedata.FSFileInfo in project jade-data-repo by DataBiosphere.

the class GcsPdao method copyFile.

/**
 * Copies the source object named by the load request into the given bucket under
 * {@code <dataset-id>/<file-id>} and describes the resulting object.
 *
 * @param dataset dataset whose id forms the first segment of the target path
 * @param fileLoadModel load request; its source path locates the blob to copy
 * @param fileId id of the new file; forms the second segment of the target path
 * @param bucketResource bucket that receives the copy
 * @return file info holding the file id, creation time, gs path, checksums, size and
 *     bucket resource id of the copied object
 * @throws PdaoFileCopyException when the storage client raises a {@code StorageException}
 * @throws PdaoException when the gs URI for the target cannot be constructed
 */
public FSFileInfo copyFile(Dataset dataset, FileLoadModel fileLoadModel, String fileId, GoogleBucketResource bucketResource) {
    Storage storage = storageForBucket(bucketResource);
    Blob source = getBlobFromGsPath(storage, fileLoadModel.getSourcePath());
    // Objects are stored at /<dataset-id>/<file-id>
    String objectPath = dataset.getId().toString() + "/" + fileId;
    try {
        // The documentation is vague about whether copying chunk by chunk matters: one set
        // of examples does it and another does not. Timeouts were showing up, probably on
        // particularly large files, so the timeouts were moved to application.properties
        // for tuning and the copy is done in chunks.
        CopyWriter copier = source.copyTo(BlobId.of(bucketResource.getName(), objectPath));
        while (!copier.isDone()) {
            copier.copyChunk();
        }
        Blob copied = copier.getResult();

        // MD5 is computed per component, so for a multi-component object it cannot be
        // used to validate the file contents on access. It is therefore returned only
        // when there is a single component (or no component count at all).
        // See https://cloud.google.com/storage/docs/hashes-etags
        Integer components = copied.getComponentCount();
        String md5 = (components == null || components == 1) ? copied.getMd5ToHexString() : null;

        // The meaning of the Long is not documented; it appears to be standard POSIX
        // milliseconds since Jan 1, 1970.
        Instant createdAt = Instant.ofEpochMilli(copied.getCreateTime());
        URI targetUri = new URI("gs", bucketResource.getName(), "/" + objectPath, null, null);

        return new FSFileInfo()
            .fileId(fileId)
            .createdDate(createdAt.toString())
            .gspath(targetUri.toString())
            .checksumCrc32c(copied.getCrc32cToHexString())
            .checksumMd5(md5)
            .size(copied.getSize())
            .bucketResourceId(bucketResource.getResourceId().toString());
    } catch (StorageException storageEx) {
        // NOTE(review): the original remark here hints that flaky Google failures might
        // deserve a retry while access-denied should bail out; at present every storage
        // failure is wrapped the same way.
        throw new PdaoFileCopyException("File ingest failed", storageEx);
    } catch (URISyntaxException uriEx) {
        throw new PdaoException("Bad URI of our own making", uriEx);
    }
}
Also used : Blob(com.google.cloud.storage.Blob) Storage(com.google.cloud.storage.Storage) FSFileInfo(bio.terra.service.filedata.FSFileInfo) PdaoException(bio.terra.common.exception.PdaoException) Instant(java.time.Instant) PdaoFileCopyException(bio.terra.common.exception.PdaoFileCopyException) URISyntaxException(java.net.URISyntaxException) URI(java.net.URI) StorageException(com.google.cloud.storage.StorageException) CopyWriter(com.google.cloud.storage.CopyWriter)

Aggregations

FSFileInfo (bio.terra.service.filedata.FSFileInfo)5 FlightMap (bio.terra.stairway.FlightMap)3 FileLoadModel (bio.terra.model.FileLoadModel)2 PdaoException (bio.terra.common.exception.PdaoException)1 PdaoFileCopyException (bio.terra.common.exception.PdaoFileCopyException)1 FSItem (bio.terra.service.filedata.FSItem)1 FileSystemAbortTransactionException (bio.terra.service.filedata.exception.FileSystemAbortTransactionException)1 FileSystemCorruptException (bio.terra.service.filedata.exception.FileSystemCorruptException)1 FireStoreFile (bio.terra.service.filedata.google.firestore.FireStoreFile)1 LoadCandidates (bio.terra.service.load.LoadCandidates)1 LoadFile (bio.terra.service.load.LoadFile)1 GoogleBucketResource (bio.terra.service.resourcemanagement.google.GoogleBucketResource)1 FlightState (bio.terra.stairway.FlightState)1 StepResult (bio.terra.stairway.StepResult)1 Blob (com.google.cloud.storage.Blob)1 CopyWriter (com.google.cloud.storage.CopyWriter)1 Storage (com.google.cloud.storage.Storage)1 StorageException (com.google.cloud.storage.StorageException)1 URI (java.net.URI)1 URISyntaxException (java.net.URISyntaxException)1