Use of edu.harvard.iq.dataverse.DataFile in project dataverse by IQSS.
The class IngestServiceBean, method startIngestJobs.
// TODO: consider creating a version of this method that would take
// datasetversion as the argument.
// -- L.A. 4.6
// @Asynchronous - just an experiment...
public void startIngestJobs(Dataset dataset, AuthenticatedUser user) {
int count = 0;
List<DataFile> scheduledFiles = new ArrayList<>();
IngestMessage ingestMessage = null;
for (DataFile dataFile : dataset.getFiles()) {
if (dataFile.isIngestScheduled()) {
// todo: investigate why when calling save with the file object
// gotten from the loop, the roles assignment added at create is removed
// (switching to refinding via id resolves that)
dataFile = fileService.find(dataFile.getId());
long ingestSizeLimit = -1;
try {
ingestSizeLimit = systemConfig.getTabularIngestSizeLimit(getTabDataReaderByMimeType(dataFile.getContentType()).getFormatName());
} catch (IOException ioex) {
logger.warning("IO Exception trying to retrieve the ingestable format identifier from the plugin for type " + dataFile.getContentType() + " (non-fatal);");
}
if (ingestSizeLimit == -1 || dataFile.getFilesize() < ingestSizeLimit) {
dataFile.SetIngestInProgress();
dataFile = fileService.save(dataFile);
scheduledFiles.add(dataFile);
logger.fine("Attempting to queue the file " + dataFile.getFileMetadata().getLabel() + " for ingest, for dataset: " + dataset.getGlobalId());
count++;
} else {
dataFile.setIngestDone();
dataFile = fileService.save(dataFile);
logger.info("Skipping tabular ingest of the file " + dataFile.getFileMetadata().getLabel() + ", because of the size limit (set to " + ingestSizeLimit + " bytes).");
}
}
}
if (count > 0) {
String info = "Ingest of " + count + " tabular data file(s) is in progress.";
logger.info(info);
datasetService.addDatasetLock(dataset.getId(), DatasetLock.Reason.Ingest, (user != null) ? user.getId() : null, info);
DataFile[] scheduledFilesArray = (DataFile[]) scheduledFiles.toArray(new DataFile[count]);
scheduledFiles = null;
// Sort ingest jobs by file size:
Arrays.sort(scheduledFilesArray, new Comparator<DataFile>() {
@Override
public int compare(DataFile d1, DataFile d2) {
long a = d1.getFilesize();
long b = d2.getFilesize();
return Long.valueOf(a).compareTo(b);
}
});
ingestMessage = new IngestMessage(IngestMessage.INGEST_MESAGE_LEVEL_INFO);
for (int i = 0; i < count; i++) {
ingestMessage.addFileId(scheduledFilesArray[i].getId());
logger.fine("Sorted order: " + i + " (size=" + scheduledFilesArray[i].getFilesize() + ")");
}
QueueConnection conn = null;
QueueSession session = null;
QueueSender sender = null;
try {
conn = factory.createQueueConnection();
session = conn.createQueueSession(false, 0);
sender = session.createSender(queue);
// ingestMessage.addFile(new File(tempFileLocation));
Message message = session.createObjectMessage(ingestMessage);
// try {
sender.send(message);
// } catch (JMSException ex) {
// ex.printStackTrace();
// }
} catch (JMSException ex) {
ex.printStackTrace();
// throw new IOException(ex.getMessage());
} finally {
try {
if (sender != null) {
sender.close();
}
if (session != null) {
session.close();
}
if (conn != null) {
conn.close();
}
} catch (JMSException ex) {
ex.printStackTrace();
}
}
}
}
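As a usage sketch (the @EJB injection point, the caller name, and when it is invoked are my assumptions, not part of the project snippet above), a typical caller queues ingest once, after newly uploaded files have been attached to the dataset and saved:

    // Hypothetical caller sketch: after upload, queue tabular ingest for every file
    // that was flagged isIngestScheduled() when it was created.
    @EJB
    IngestServiceBean ingestService;

    public void finalizeUpload(Dataset dataset, AuthenticatedUser user) {
        // Locks the dataset and sends one JMS message covering all scheduled files.
        ingestService.startIngestJobs(dataset, user);
    }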
Use of edu.harvard.iq.dataverse.DataFile in project dataverse by IQSS.
The class IngestServiceBean, method fixMissingOriginalType.
// This method fixes a datatable object that's missing the format type of
// the ingested original. It will check the saved original file to
// determine the type.
private void fixMissingOriginalType(long fileId) {
DataFile dataFile = fileService.find(fileId);
if (dataFile != null && dataFile.isTabularData()) {
String originalFormat = dataFile.getDataTable().getOriginalFileFormat();
Long datatableId = dataFile.getDataTable().getId();
if (StringUtil.isEmpty(originalFormat) || originalFormat.equals(FileUtil.MIME_TYPE_TAB)) {
// We need to determine the mime type of the saved original
// and save it in the database.
//
// First, we need access to the file. Note that the code below
// works with any supported StorageIO driver (although, as of now
// all the production installations out there are only using filesystem
// access; but just in case)
// The FileUtil method that determines the type takes java.io.File
// as an argument. So for StorageIO drivers that provide local
// file access, we'll just go directly to the stored file. For
// swift and similar implementations, we'll read the saved aux
// channel and save it as a local temp file.
StorageIO<DataFile> storageIO;
File savedOriginalFile = null;
boolean tempFileRequired = false;
try {
storageIO = dataFile.getStorageIO();
storageIO.open();
if (storageIO.isLocalFile()) {
try {
savedOriginalFile = storageIO.getAuxObjectAsPath(FileUtil.SAVED_ORIGINAL_FILENAME_EXTENSION).toFile();
} catch (IOException ioex) {
// do nothing, just make sure savedOriginalFile is still null:
savedOriginalFile = null;
}
}
if (savedOriginalFile == null) {
tempFileRequired = true;
ReadableByteChannel savedOriginalChannel = (ReadableByteChannel) storageIO.openAuxChannel(FileUtil.SAVED_ORIGINAL_FILENAME_EXTENSION);
savedOriginalFile = File.createTempFile("tempSavedOriginal", ".tmp");
FileChannel tempSavedOriginalChannel = new FileOutputStream(savedOriginalFile).getChannel();
tempSavedOriginalChannel.transferFrom(savedOriginalChannel, 0, storageIO.getAuxObjectSize(FileUtil.SAVED_ORIGINAL_FILENAME_EXTENSION));
}
} catch (Exception ex) {
logger.warning("Exception " + ex.getClass() + " caught trying to open StorageIO channel for the saved original; (datafile id=" + fileId + ", datatable id=" + datatableId + "): " + ex.getMessage());
savedOriginalFile = null;
}
if (savedOriginalFile == null) {
logger.warning("Could not obtain the saved original file as a java.io.File! (datafile id=" + fileId + ", datatable id=" + datatableId + ")");
return;
}
String fileTypeDetermined = null;
try {
fileTypeDetermined = FileUtil.determineFileType(savedOriginalFile, "");
} catch (IOException ioex) {
logger.warning("Caught exception trying to determine original file type (datafile id=" + fileId + ", datatable id=" + datatableId + "): " + ioex.getMessage());
}
// If we had to create a temp file, delete it now:
if (tempFileRequired) {
savedOriginalFile.delete();
}
if (fileTypeDetermined == null) {
logger.warning("Failed to determine preserved original file type. (datafile id=" + fileId + ", datatable id=" + datatableId + ")");
return;
}
// If the determined type is "text/plain", it really means it must be a CSV file.
if (fileTypeDetermined.startsWith("text/plain")) {
fileTypeDetermined = FileUtil.MIME_TYPE_CSV;
}
// and, finally, if it is still "application/octet-stream", it must be Excel:
if (FileUtil.MIME_TYPE_UNDETERMINED_DEFAULT.equals(fileTypeDetermined)) {
fileTypeDetermined = FileUtil.MIME_TYPE_XLSX;
}
logger.info("Original file type determined: " + fileTypeDetermined + " (file id=" + fileId + ", datatable id=" + datatableId + "; file path: " + savedOriginalFile.getAbsolutePath() + ")");
// save permanently in the database:
dataFile.getDataTable().setOriginalFileFormat(fileTypeDetermined);
fileService.saveDataTable(dataFile.getDataTable());
} else {
logger.info("DataFile id=" + fileId + "; original type already present: " + originalFormat);
}
} else {
logger.warning("DataFile id=" + fileId + ": No such DataFile!");
}
}
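The temp-file branch above writes through a FileChannel that is never explicitly closed. As a hedged sketch (the helper name is hypothetical, not something the project defines), the same "materialize the saved original locally" step can be written with try-with-resources so both channels are released:

    // Sketch: copy the saved-original aux object to a local temp file, closing both channels.
    private File copySavedOriginalToTemp(StorageIO<DataFile> storageIO) throws IOException {
        File tempFile = File.createTempFile("tempSavedOriginal", ".tmp");
        try (ReadableByteChannel in = (ReadableByteChannel) storageIO.openAuxChannel(FileUtil.SAVED_ORIGINAL_FILENAME_EXTENSION);
             FileChannel out = new FileOutputStream(tempFile).getChannel()) {
            out.transferFrom(in, 0, storageIO.getAuxObjectSize(FileUtil.SAVED_ORIGINAL_FILENAME_EXTENSION));
        }
        return tempFile;
    }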
Use of edu.harvard.iq.dataverse.DataFile in project dataverse by IQSS.
The class IngestServiceBean, method extractMetadata.
/*
* extractMetadata:
* framework for extracting metadata from uploaded files. The results will
* be used to populate the metadata of the Dataset to which the file belongs.
*/
public boolean extractMetadata(String tempFileLocation, DataFile dataFile, DatasetVersion editVersion) throws IOException {
boolean ingestSuccessful = false;
FileInputStream tempFileInputStream = null;
try {
tempFileInputStream = new FileInputStream(new File(tempFileLocation));
} catch (FileNotFoundException notfoundEx) {
throw new IOException("Could not open temp file " + tempFileLocation);
}
// Locate metadata extraction plugin for the file format by looking
// it up with the Ingest Service Provider Registry:
// FileMetadataExtractor extractorPlugin = IngestSP.getMetadataExtractorByMIMEType(dfile.getContentType());
FileMetadataExtractor extractorPlugin = new FITSFileMetadataExtractor();
FileMetadataIngest extractedMetadata = extractorPlugin.ingest(new BufferedInputStream(tempFileInputStream));
Map<String, Set<String>> extractedMetadataMap = extractedMetadata.getMetadataMap();
// Store the fields and values we've gathered for safe-keeping:
// from 3.6:
// attempt to ingest the extracted metadata into the database;
// TODO: this should throw an exception if anything goes wrong.
FileMetadata fileMetadata = dataFile.getFileMetadata();
if (extractedMetadataMap != null) {
logger.fine("Ingest Service: Processing extracted metadata;");
if (extractedMetadata.getMetadataBlockName() != null) {
logger.fine("Ingest Service: This metadata belongs to the " + extractedMetadata.getMetadataBlockName() + " metadata block.");
processDatasetMetadata(extractedMetadata, editVersion);
}
processFileLevelMetadata(extractedMetadata, fileMetadata);
}
ingestSuccessful = true;
return ingestSuccessful;
}
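A minimal caller sketch (the variable names and temp file path are placeholders of mine; it assumes the uploaded file is a FITS file, the only format the hard-wired FITSFileMetadataExtractor above handles, and that ingestService, dataFile, editVersion, and logger are in scope):

    // Hypothetical caller: extract FITS metadata and fold it into the version being edited.
    String tempFileLocation = "/tmp/dataverse/upload.fits"; // placeholder path
    try {
        boolean extracted = ingestService.extractMetadata(tempFileLocation, dataFile, editVersion);
        if (extracted) {
            logger.fine("Extracted file- and dataset-level metadata from " + tempFileLocation);
        }
    } catch (IOException ioex) {
        logger.warning("Metadata extraction failed for " + tempFileLocation + ": " + ioex.getMessage());
    }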
Use of edu.harvard.iq.dataverse.DataFile in project dataverse by IQSS.
The class IngestServiceBean, method produceCharacterSummaryStatistics.
public void produceCharacterSummaryStatistics(DataFile dataFile, File generatedTabularFile) throws IOException {
for (int i = 0; i < dataFile.getDataTable().getVarQuantity(); i++) {
if (dataFile.getDataTable().getDataVariables().get(i).isTypeCharacter()) {
StorageIO<DataFile> storageIO = dataFile.getStorageIO();
storageIO.open();
logger.fine("subsetting character vector");
String[] variableVector = TabularSubsetGenerator.subsetStringVector(new FileInputStream(generatedTabularFile), i, dataFile.getDataTable().getCaseQuantity().intValue());
// calculateCharacterSummaryStatistics(dataFile, i, variableVector);
// calculate the UNF while we are at it:
logger.fine("Calculating UNF on a String vector");
calculateUNF(dataFile, i, variableVector);
logger.fine("Done! (character)");
variableVector = null;
}
}
}
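A sketch of the same per-variable loop with the tabular-file stream scoped to try-with-resources (my restructuring, not the project's code; the snippet above opens a new FileInputStream for every character variable without an explicit close):

    // Sketch: subset each character variable, calculate its UNF, and close the stream each pass.
    for (int i = 0; i < dataFile.getDataTable().getVarQuantity(); i++) {
        if (dataFile.getDataTable().getDataVariables().get(i).isTypeCharacter()) {
            try (FileInputStream in = new FileInputStream(generatedTabularFile)) {
                String[] variableVector = TabularSubsetGenerator.subsetStringVector(
                        in, i, dataFile.getDataTable().getCaseQuantity().intValue());
                calculateUNF(dataFile, i, variableVector);
            }
        }
    }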
Use of edu.harvard.iq.dataverse.DataFile in project dataverse by IQSS.
The class HarvesterServiceBean, method deleteHarvestedDataset.
private void deleteHarvestedDataset(Dataset dataset, DataverseRequest request, Logger hdLogger) {
// Purge all the SOLR documents associated with this dataset from the
// index server:
indexService.deleteHarvestedDocuments(dataset);
try {
// Files in a harvested dataset are removed directly via the EntityManager;
// there is no need to issue a DeleteFileCommand on them.
for (DataFile harvestedFile : dataset.getFiles()) {
DataFile merged = em.merge(harvestedFile);
em.remove(merged);
harvestedFile = null;
}
dataset.setFiles(null);
Dataset merged = em.merge(dataset);
engineService.submit(new DeleteDatasetCommand(request, merged));
} catch (IllegalCommandException ex) {
// TODO: log the result
} catch (PermissionException ex) {
// TODO: log the result
} catch (CommandException ex) {
// TODO: log the result
}
// TODO: log the success result
}
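To make the logging TODOs in the catch blocks concrete, here is a hedged sketch of a small helper (the name and message wording are my assumptions) that each of the three empty catch blocks could call as logDeleteFailure(dataset, ex, hdLogger):

    // Sketch: one place to log a failed delete of a harvested dataset
    // (Logger and Level are java.util.logging, as used elsewhere in the bean).
    private void logDeleteFailure(Dataset dataset, Exception ex, Logger hdLogger) {
        hdLogger.log(Level.WARNING,
                "Failed to destroy harvested dataset " + dataset.getGlobalId(), ex);
    }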