use of bio.terra.service.filedata.FSFileInfo in project jade-data-repo by DataBiosphere.
the class IngestDriverStep method getLoadCandidates.
private LoadCandidates getLoadCandidates(FlightContext context, UUID loadId, int concurrentLoads)
    throws DatabaseOperationException, InterruptedException {
  // We start by getting the database view of the state of loads.
  // For the running loads, we ask Stairway what the actual state is.
  // If they have completed, we mark them as such.
  // We then update the failure count and running loads list in the
  // LoadCandidates so it correctly reflects the running state
  // right now (more or less).
  LoadCandidates candidates = loadService.findCandidates(loadId, concurrentLoads);
  logger.debug(
      "Candidates from db: failedLoads={} runningLoads={} candidateFiles={}",
      candidates.getFailedLoads(),
      candidates.getRunningLoads().size(),
      candidates.getCandidateFiles().size());
  int failureCount = candidates.getFailedLoads();
  List<LoadFile> realRunningLoads = new LinkedList<>();
  for (LoadFile loadFile : candidates.getRunningLoads()) {
    FlightState flightState = context.getStairway().getFlightState(loadFile.getFlightId());
    switch (flightState.getFlightStatus()) {
      case RUNNING:
      case WAITING:
      case READY:
      case QUEUED:
        realRunningLoads.add(loadFile);
        break;
      case ERROR:
      case FATAL:
        {
          String error = "unknown error";
          if (flightState.getException().isPresent()) {
            error = flightState.getException().get().toString();
          }
          loadService.setLoadFileFailed(loadId, loadFile.getTargetPath(), error);
          failureCount++;
          break;
        }
      case SUCCESS:
        {
          FlightMap resultMap = flightState.getResultMap().orElse(null);
          if (resultMap == null) {
            throw new FileSystemCorruptException("no result map in flight state");
          }
          String fileId = resultMap.get(FileMapKeys.FILE_ID, String.class);
          FSFileInfo fileInfo = resultMap.get(FileMapKeys.FILE_INFO, FSFileInfo.class);
          loadService.setLoadFileSucceeded(loadId, loadFile.getTargetPath(), fileId, fileInfo);
          break;
        }
    }
  }
  candidates.failedLoads(failureCount).runningLoads(realRunningLoads);
  logger.debug(
      "Candidates resolved: failedLoads={} runningLoads={} candidateFiles={}",
      candidates.getFailedLoads(),
      candidates.getRunningLoads().size(),
      candidates.getCandidateFiles().size());
  return candidates;
}
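From the accessors and fluent setters used above, LoadCandidates appears to be a simple holder for the three quantities the driver reconciles. A minimal sketch of that shape, inferred from these call sites only (the field names and fluent-setter style are assumptions, not the project's actual source):

import java.util.List;

public class LoadCandidates {
  private int failedLoads;
  private List<LoadFile> runningLoads;
  private List<LoadFile> candidateFiles;

  public int getFailedLoads() { return failedLoads; }
  public List<LoadFile> getRunningLoads() { return runningLoads; }
  public List<LoadFile> getCandidateFiles() { return candidateFiles; }

  // Fluent setters, matching calls like candidates.failedLoads(n).runningLoads(list)
  public LoadCandidates failedLoads(int failedLoads) {
    this.failedLoads = failedLoads;
    return this;
  }

  public LoadCandidates runningLoads(List<LoadFile> runningLoads) {
    this.runningLoads = runningLoads;
    return this;
  }

  public LoadCandidates candidateFiles(List<LoadFile> candidateFiles) {
    this.candidateFiles = candidateFiles;
    return this;
  }
}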
use of bio.terra.service.filedata.FSFileInfo in project jade-data-repo by DataBiosphere.
the class LoadDaoUnitTest method loadFilesTest.
@Test
public void loadFilesTest() throws Exception {
  UUID loadId = populateFiles(8);

  // First set of candidates
  LoadCandidates candidates = loadDao.findCandidates(loadId, 3);
  testLoadCandidates(candidates, 0, 0, 3);
  List<LoadFile> loadSet1 = candidates.getCandidateFiles();
  FSFileInfo fsFileInfo;
  fsFileInfo = new FSFileInfo().checksumCrc32c("crcChecksum").checksumMd5("md5Checksum");
  loadDao.setLoadFileSucceeded(loadId, loadSet1.get(0).getTargetPath(), "fileidA", fsFileInfo);
  loadDao.setLoadFileFailed(loadId, loadSet1.get(1).getTargetPath(), "failureB");
  loadDao.setLoadFileRunning(
      loadId, loadSet1.get(2).getTargetPath(), FlightIdsUsedByTest.FLIGHT_C.getId());

  // Second set of candidates - set prior running to succeeded
  candidates = loadDao.findCandidates(loadId, 3);
  testLoadCandidates(candidates, 1, 1, 3);
  List<LoadFile> loadSet2 = candidates.getCandidateFiles();
  loadDao.setLoadFileSucceeded(loadId, loadSet1.get(2).getTargetPath(), "fileidC", fsFileInfo);
  loadDao.setLoadFileRunning(
      loadId, loadSet2.get(0).getTargetPath(), FlightIdsUsedByTest.FLIGHT_D.getId());
  loadDao.setLoadFileRunning(
      loadId, loadSet2.get(1).getTargetPath(), FlightIdsUsedByTest.FLIGHT_E.getId());
  loadDao.setLoadFileRunning(
      loadId, loadSet2.get(2).getTargetPath(), FlightIdsUsedByTest.FLIGHT_F.getId());

  // Third set of candidates - set all 3 prior to failed
  candidates = loadDao.findCandidates(loadId, 3);
  testLoadCandidates(candidates, 1, 3, 2);
  List<LoadFile> loadSet3 = candidates.getCandidateFiles();
  loadDao.setLoadFileFailed(loadId, loadSet2.get(0).getTargetPath(), "errorD");
  loadDao.setLoadFileFailed(loadId, loadSet2.get(1).getTargetPath(), "errorE");
  loadDao.setLoadFileFailed(loadId, loadSet2.get(2).getTargetPath(), "errorF");
  loadDao.setLoadFileRunning(
      loadId, loadSet3.get(0).getTargetPath(), FlightIdsUsedByTest.FLIGHT_G.getId());
  loadDao.setLoadFileRunning(
      loadId, loadSet3.get(1).getTargetPath(), FlightIdsUsedByTest.FLIGHT_H.getId());

  // No more candidates, but things are still running
  candidates = loadDao.findCandidates(loadId, 3);
  testLoadCandidates(candidates, 4, 2, 0);
  loadDao.setLoadFileSucceeded(loadId, loadSet3.get(0).getTargetPath(), "fileidG", fsFileInfo);
  loadDao.setLoadFileSucceeded(loadId, loadSet3.get(1).getTargetPath(), "fileidH", fsFileInfo);

  // No more candidates and nothing running; this would be the bulk load completed state
  candidates = loadDao.findCandidates(loadId, 3);
  testLoadCandidates(candidates, 4, 0, 0);

  // clean up after ourselves - check that we properly find nothing
  loadDao.cleanFiles(loadId);
  candidates = loadDao.findCandidates(loadId, 3);
  testLoadCandidates(candidates, 0, 0, 0);
}
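The testLoadCandidates helper is not shown on this page. From the call sites, the argument order appears to be failed count, running count, then candidate count; a plausible sketch using JUnit 4's assertEquals (the parameter names are inferred, not the project's actual helper):

import static org.junit.Assert.assertEquals;

private void testLoadCandidates(
    LoadCandidates candidates, int failedLoads, int runningLoads, int candidateFiles) {
  // Compare each count the DAO reports against what the test expects at this stage
  assertEquals("failed load count", failedLoads, candidates.getFailedLoads());
  assertEquals("running load count", runningLoads, candidates.getRunningLoads().size());
  assertEquals("candidate file count", candidateFiles, candidates.getCandidateFiles().size());
}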
use of bio.terra.service.filedata.FSFileInfo in project jade-data-repo by DataBiosphere.
the class IngestFileFileStep method doStep.
@Override
public StepResult doStep(FlightContext context) {
  FlightMap workingMap = context.getWorkingMap();
  Boolean loadComplete = workingMap.get(FileMapKeys.LOAD_COMPLETED, Boolean.class);
  if (loadComplete == null || !loadComplete) {
    FlightMap inputParameters = context.getInputParameters();
    FileLoadModel fileLoadModel =
        inputParameters.get(JobMapKeys.REQUEST.getKeyName(), FileLoadModel.class);
    FSFileInfo fsFileInfo = workingMap.get(FileMapKeys.FILE_INFO, FSFileInfo.class);
    String fileId = workingMap.get(FileMapKeys.FILE_ID, String.class);
    FireStoreFile newFile =
        new FireStoreFile()
            .fileId(fileId)
            .mimeType(fileLoadModel.getMimeType())
            .description(fileLoadModel.getDescription())
            .bucketResourceId(fsFileInfo.getBucketResourceId())
            .fileCreatedDate(fsFileInfo.getCreatedDate())
            .gspath(fsFileInfo.getGspath())
            .checksumCrc32c(fsFileInfo.getChecksumCrc32c())
            .checksumMd5(fsFileInfo.getChecksumMd5())
            .size(fsFileInfo.getSize())
            .loadTag(fileLoadModel.getLoadTag());
    try {
      fileDao.createFileMetadata(dataset, newFile);
      // Retrieve to build the complete FSItem
      FSItem fsItem = fileDao.retrieveById(dataset, fileId, 1, true);
      workingMap.put(JobMapKeys.RESPONSE.getKeyName(), fileService.fileModelFromFSItem(fsItem));
    } catch (FileSystemAbortTransactionException rex) {
      return new StepResult(StepStatus.STEP_RESULT_FAILURE_RETRY, rex);
    }
  }
  return StepResult.getStepResultSuccess();
}
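Stairway steps pair doStep with an undoStep that reverses the work if a later step in the flight fails. The undo for this step is not shown on this page; a hypothetical sketch of what it might look like (fileDao.deleteFileMetadata is an assumed method name, not confirmed here):

@Override
public StepResult undoStep(FlightContext context) {
  // Hypothetical sketch: remove the metadata written by doStep so the
  // flight can roll back cleanly. The deleteFileMetadata call is an
  // assumption about the DAO's API, not taken from this page.
  String fileId = context.getWorkingMap().get(FileMapKeys.FILE_ID, String.class);
  if (fileId != null) {
    fileDao.deleteFileMetadata(dataset, fileId);
  }
  return StepResult.getStepResultSuccess();
}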
use of bio.terra.service.filedata.FSFileInfo in project jade-data-repo by DataBiosphere.
the class IngestFilePrimaryDataStep method doStep.
@Override
public StepResult doStep(FlightContext context) {
  FlightMap inputParameters = context.getInputParameters();
  FileLoadModel fileLoadModel =
      inputParameters.get(JobMapKeys.REQUEST.getKeyName(), FileLoadModel.class);
  FlightMap workingMap = context.getWorkingMap();
  String fileId = workingMap.get(FileMapKeys.FILE_ID, String.class);
  Boolean loadComplete = workingMap.get(FileMapKeys.LOAD_COMPLETED, Boolean.class);
  if (loadComplete == null || !loadComplete) {
    // The bucket has been selected for this file. In the single file load case, the info
    // is stored in the working map. In the bulk load case, the info is stored in the input
    // parameters.
    GoogleBucketResource bucketResource =
        inputParameters.get(FileMapKeys.BUCKET_INFO, GoogleBucketResource.class);
    if (bucketResource == null) {
      bucketResource = workingMap.get(FileMapKeys.BUCKET_INFO, GoogleBucketResource.class);
    }
    FSFileInfo fsFileInfo;
    if (configService.testInsertFault(ConfigEnum.LOAD_SKIP_FILE_LOAD)) {
      fsFileInfo =
          new FSFileInfo()
              .fileId(fileId)
              .bucketResourceId(bucketResource.getResourceId().toString())
              .checksumCrc32c(null)
              .checksumMd5("baaaaaad")
              .createdDate(Instant.now().toString())
              .gspath("gs://path")
              .size(100L);
    } else {
      fsFileInfo = gcsPdao.copyFile(dataset, fileLoadModel, fileId, bucketResource);
    }
    workingMap.put(FileMapKeys.FILE_INFO, fsFileInfo);
  }
  return StepResult.getStepResultSuccess();
}
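Taken together, the fluent calls on this page pin down most of FSFileInfo's surface. A minimal reconstruction inferred from those call sites (field types follow the arguments passed above; this is a sketch, not the project's actual class):

public class FSFileInfo {
  private String fileId;
  private String createdDate;    // stored as an ISO-8601 string, per Instant.now().toString()
  private String gspath;
  private String checksumCrc32c; // may be null
  private String checksumMd5;    // left null for multi-component GCS objects
  private Long size;
  private String bucketResourceId;

  public String getFileId() { return fileId; }
  public String getCreatedDate() { return createdDate; }
  public String getGspath() { return gspath; }
  public String getChecksumCrc32c() { return checksumCrc32c; }
  public String getChecksumMd5() { return checksumMd5; }
  public Long getSize() { return size; }
  public String getBucketResourceId() { return bucketResourceId; }

  // Fluent setters in the style used throughout these snippets
  public FSFileInfo fileId(String v) { this.fileId = v; return this; }
  public FSFileInfo createdDate(String v) { this.createdDate = v; return this; }
  public FSFileInfo gspath(String v) { this.gspath = v; return this; }
  public FSFileInfo checksumCrc32c(String v) { this.checksumCrc32c = v; return this; }
  public FSFileInfo checksumMd5(String v) { this.checksumMd5 = v; return this; }
  public FSFileInfo size(Long v) { this.size = v; return this; }
  public FSFileInfo bucketResourceId(String v) { this.bucketResourceId = v; return this; }
}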
use of bio.terra.service.filedata.FSFileInfo in project jade-data-repo by DataBiosphere.
the class GcsPdao method copyFile.
public FSFileInfo copyFile(
    Dataset dataset, FileLoadModel fileLoadModel, String fileId, GoogleBucketResource bucketResource) {
  Storage storage = storageForBucket(bucketResource);
  Blob sourceBlob = getBlobFromGsPath(storage, fileLoadModel.getSourcePath());
  // Our path is /<dataset-id>/<file-id>
  String targetPath = dataset.getId().toString() + "/" + fileId;
  try {
    // The documentation is vague about whether it is important to copy by chunk;
    // one set of examples does it and another doesn't.
    //
    // I have been seeing timeouts, and I think they are due to particularly large files,
    // so I exported the timeouts to application.properties to allow for tuning,
    // and I am changing this to copy by chunk.
    CopyWriter writer = sourceBlob.copyTo(BlobId.of(bucketResource.getName(), targetPath));
    while (!writer.isDone()) {
      writer.copyChunk();
    }
    Blob targetBlob = writer.getResult();
    // MD5 is computed per-component. So if there are multiple components, the MD5 here is
    // not useful for validating the contents of the file on access. Therefore, we only
    // return the MD5 if there is only a single component. For more details,
    // see https://cloud.google.com/storage/docs/hashes-etags
    Integer componentCount = targetBlob.getComponentCount();
    String checksumMd5 = null;
    if (componentCount == null || componentCount == 1) {
      checksumMd5 = targetBlob.getMd5ToHexString();
    }
    // Grumble! The meaning of the Long is not documented. From poking around,
    // it appears to be standard POSIX milliseconds since Jan 1, 1970.
    Instant createTime = Instant.ofEpochMilli(targetBlob.getCreateTime());
    URI gspath = new URI("gs", bucketResource.getName(), "/" + targetPath, null, null);
    FSFileInfo fsFileInfo =
        new FSFileInfo()
            .fileId(fileId)
            .createdDate(createTime.toString())
            .gspath(gspath.toString())
            .checksumCrc32c(targetBlob.getCrc32cToHexString())
            .checksumMd5(checksumMd5)
            .size(targetBlob.getSize())
            .bucketResourceId(bucketResource.getResourceId().toString());
    return fsFileInfo;
  } catch (StorageException ex) {
    // This may be a flaky Google case, or we may need to bail out if access is denied.
    throw new PdaoFileCopyException("File ingest failed", ex);
  } catch (URISyntaxException ex) {
    throw new PdaoException("Bad URI of our own making", ex);
  }
}
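The chunked-copy pattern above comes straight from the google-cloud-storage client: Blob.copyTo returns a CopyWriter, and calling copyChunk in a loop issues one rewrite request at a time, so no single request runs long enough to hit a per-request timeout on a very large object. A minimal standalone sketch of the same pattern (the bucket and object names here are placeholders):

import com.google.cloud.storage.Blob;
import com.google.cloud.storage.BlobId;
import com.google.cloud.storage.CopyWriter;
import com.google.cloud.storage.Storage;
import com.google.cloud.storage.StorageOptions;

public class ChunkedCopyExample {
  public static void main(String[] args) {
    Storage storage = StorageOptions.getDefaultInstance().getService();
    // Placeholder bucket/object names -- substitute your own
    Blob source = storage.get(BlobId.of("source-bucket", "path/to/source-object"));
    CopyWriter writer = source.copyTo(BlobId.of("target-bucket", "path/to/target-object"));
    // Each copyChunk() call performs one bounded rewrite request; loop until
    // the service reports the copy is complete
    while (!writer.isDone()) {
      writer.copyChunk();
    }
    Blob target = writer.getResult();
    System.out.printf(
        "Copied %d bytes, crc32c=%s%n", target.getSize(), target.getCrc32cToHexString());
  }
}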