Example 41 with FileMetadata

Use of edu.harvard.iq.dataverse.FileMetadata in project dataverse by IQSS.

From the class IndexServiceBean, the method indexDataset:

public Future<String> indexDataset(Dataset dataset, boolean doNormalSolrDocCleanUp) {
    logger.fine("indexing dataset " + dataset.getId());
    /**
     * @todo should we use solrDocIdentifierDataset or
     * IndexableObject.IndexableTypes.DATASET.getName() + "_" ?
     */
    // String solrIdPublished = solrDocIdentifierDataset + dataset.getId();
    String solrIdPublished = determinePublishedDatasetSolrDocId(dataset);
    String solrIdDraftDataset = IndexableObject.IndexableTypes.DATASET.getName() + "_" + dataset.getId() + IndexableDataset.DatasetState.WORKING_COPY.getSuffix();
    // String solrIdDeaccessioned = IndexableObject.IndexableTypes.DATASET.getName() + "_" + dataset.getId() + IndexableDataset.DatasetState.DEACCESSIONED.getSuffix();
    String solrIdDeaccessioned = determineDeaccessionedDatasetId(dataset);
    StringBuilder debug = new StringBuilder();
    debug.append("\ndebug:\n");
    int numPublishedVersions = 0;
    List<DatasetVersion> versions = dataset.getVersions();
    List<String> solrIdsOfFilesToDelete = new ArrayList<>();
    for (DatasetVersion datasetVersion : versions) {
        Long versionDatabaseId = datasetVersion.getId();
        String versionTitle = datasetVersion.getTitle();
        String semanticVersion = datasetVersion.getSemanticVersion();
        DatasetVersion.VersionState versionState = datasetVersion.getVersionState();
        if (versionState.equals(DatasetVersion.VersionState.RELEASED)) {
            numPublishedVersions += 1;
        }
        debug.append("version found with database id " + versionDatabaseId + "\n");
        debug.append("- title: " + versionTitle + "\n");
        debug.append("- semanticVersion-VersionState: " + semanticVersion + "-" + versionState + "\n");
        List<FileMetadata> fileMetadatas = datasetVersion.getFileMetadatas();
        List<String> fileInfo = new ArrayList<>();
        for (FileMetadata fileMetadata : fileMetadatas) {
            String solrIdOfPublishedFile = solrDocIdentifierFile + fileMetadata.getDataFile().getId();
            /**
             * It sounds weird but the first thing we'll do is preemptively
             * delete the Solr documents of all published files. Don't
             * worry, published files will be re-indexed later along with
             * the dataset. We do this so users can delete files from
             * published versions of datasets and then re-publish a new
             * version without fear that their old published files (now
             * deleted from the latest published version) will be
             * searchable. See also
             * https://github.com/IQSS/dataverse/issues/762
             */
            solrIdsOfFilesToDelete.add(solrIdOfPublishedFile);
            fileInfo.add(fileMetadata.getDataFile().getId() + ":" + fileMetadata.getLabel());
        }
        try {
            /**
             * Preemptively delete *all* Solr documents for files associated
             * with the dataset based on a Solr query.
             *
             * We must query Solr for this information because the file has
             * been deleted from the database (perhaps while Solr was down,
             * as reported in https://github.com/IQSS/dataverse/issues/2086),
             * so the database doesn't even know about the file. It's an
             * orphan.
             *
             * @todo This Solr query should make the iteration above, based
             * on the database, unnecessary because the Solr query should
             * find all files for the dataset. We can probably remove the
             * iteration above after an "index all" has been performed.
             * Without an "index all" we won't be able to find files based
             * on parentId because that field wasn't searchable in 4.0.
             *
             * @todo We should also delete the corresponding Solr
             * "permission" documents for the files.
             */
            List<String> allFilesForDataset = findFilesOfParentDataset(dataset.getId());
            solrIdsOfFilesToDelete.addAll(allFilesForDataset);
        } catch (SearchException | NullPointerException ex) {
            logger.fine("could not run search of files to delete: " + ex);
        }
        int numFiles = 0;
        if (fileMetadatas != null) {
            numFiles = fileMetadatas.size();
        }
        debug.append("- files: " + numFiles + " " + fileInfo.toString() + "\n");
    }
    debug.append("numPublishedVersions: " + numPublishedVersions + "\n");
    if (doNormalSolrDocCleanUp) {
        IndexResponse resultOfAttemptToPreemptivelyDeletePublishedFiles = solrIndexService.deleteMultipleSolrIds(solrIdsOfFilesToDelete);
        debug.append("result of attempt to preemptively delete published files before reindexing: " + resultOfAttemptToPreemptivelyDeletePublishedFiles + "\n");
    }
    DatasetVersion latestVersion = dataset.getLatestVersion();
    String latestVersionStateString = latestVersion.getVersionState().name();
    DatasetVersion.VersionState latestVersionState = latestVersion.getVersionState();
    DatasetVersion releasedVersion = dataset.getReleasedVersion();
    boolean atLeastOnePublishedVersion = (releasedVersion != null);
    Map<DatasetVersion.VersionState, Boolean> desiredCards = new LinkedHashMap<>();
    /**
     * @todo refactor all of this below and have a single method that takes
     * the map of desired cards (which correspond to Solr documents) as one
     * of the arguments and does all the operations necessary to achieve the
     * desired state.
     */
    StringBuilder results = new StringBuilder();
    if (!atLeastOnePublishedVersion) {
        results.append("No published version, nothing will be indexed as ").append(solrIdPublished).append("\n");
        if (latestVersionState.equals(DatasetVersion.VersionState.DRAFT)) {
            desiredCards.put(DatasetVersion.VersionState.DRAFT, true);
            IndexableDataset indexableDraftVersion = new IndexableDataset(latestVersion);
            String indexDraftResult = addOrUpdateDataset(indexableDraftVersion);
            results.append("The latest version is a working copy (latestVersionState: ").append(latestVersionStateString).append(") and indexing was attempted for ").append(solrIdDraftDataset).append(" (limited discoverability). Result: ").append(indexDraftResult).append("\n");
            desiredCards.put(DatasetVersion.VersionState.DEACCESSIONED, false);
            if (doNormalSolrDocCleanUp) {
                String deleteDeaccessionedResult = removeDeaccessioned(dataset);
                results.append("Draft exists, no need for deaccessioned version. Deletion attempted for ").append(solrIdDeaccessioned).append(" (and files). Result: ").append(deleteDeaccessionedResult).append("\n");
            }
            desiredCards.put(DatasetVersion.VersionState.RELEASED, false);
            if (doNormalSolrDocCleanUp) {
                String deletePublishedResults = removePublished(dataset);
                results.append("No published version. Attempting to delete traces of published version from index. Result: ").append(deletePublishedResults).append("\n");
            }
            /**
             * Desired state for existence of cards: {DRAFT=true,
             * DEACCESSIONED=false, RELEASED=false}
             *
             * No published version, nothing will be indexed as dataset_17
             *
             * The latest version is a working copy (latestVersionState:
             * DRAFT) and indexing was attempted for dataset_17_draft
             * (limited discoverability). Result: indexed dataset 17 as
             * dataset_17_draft. filesIndexed: [datafile_18_draft]
             *
             * Draft exists, no need for deaccessioned version. Deletion
             * attempted for dataset_17_deaccessioned (and files). Result:
             * Attempted to delete dataset_17_deaccessioned from Solr index.
             * updateResponse was:
             * {responseHeader={status=0,QTime=1}}Attempted to delete
             * datafile_18_deaccessioned from Solr index. updateResponse was:
             * {responseHeader={status=0,QTime=1}}
             *
             * No published version. Attempting to delete traces of
             * published version from index. Result: Attempted to delete
             * dataset_17 from Solr index. updateResponse was:
             * {responseHeader={status=0,QTime=1}}Attempted to delete
             * datafile_18 from Solr index. updateResponse was:
             * {responseHeader={status=0,QTime=0}}
             */
            String result = getDesiredCardState(desiredCards) + results.toString() + debug.toString();
            logger.fine(result);
            indexDatasetPermissions(dataset);
            return new AsyncResult<>(result);
        } else if (latestVersionState.equals(DatasetVersion.VersionState.DEACCESSIONED)) {
            desiredCards.put(DatasetVersion.VersionState.DEACCESSIONED, true);
            IndexableDataset indexableDeaccessionedVersion = new IndexableDataset(latestVersion);
            String indexDeaccessionedVersionResult = addOrUpdateDataset(indexableDeaccessionedVersion);
            results.append("No draft version. Attempting to index as deaccessioned. Result: ").append(indexDeaccessionedVersionResult).append("\n");
            desiredCards.put(DatasetVersion.VersionState.RELEASED, false);
            if (doNormalSolrDocCleanUp) {
                String deletePublishedResults = removePublished(dataset);
                results.append("No published version. Attempting to delete traces of published version from index. Result: ").append(deletePublishedResults).append("\n");
            }
            desiredCards.put(DatasetVersion.VersionState.DRAFT, false);
            if (doNormalSolrDocCleanUp) {
                List<String> solrDocIdsForDraftFilesToDelete = findSolrDocIdsForDraftFilesToDelete(dataset);
                String deleteDraftDatasetVersionResult = removeSolrDocFromIndex(solrIdDraftDataset);
                String deleteDraftFilesResults = deleteDraftFiles(solrDocIdsForDraftFilesToDelete);
                results.append("Attempting to delete traces of drafts. Result: ").append(deleteDraftDatasetVersionResult).append(deleteDraftFilesResults).append("\n");
            }
            /**
             * Desired state for existence of cards: {DEACCESSIONED=true,
             * RELEASED=false, DRAFT=false}
             *
             * No published version, nothing will be indexed as dataset_17
             *
             * No draft version. Attempting to index as deaccessioned.
             * Result: indexed dataset 17 as dataset_17_deaccessioned.
             * filesIndexed: []
             *
             * No published version. Attempting to delete traces of
             * published version from index. Result: Attempted to delete
             * dataset_17 from Solr index. updateResponse was:
             * {responseHeader={status=0,QTime=0}}Attempted to delete
             * datafile_18 from Solr index. updateResponse was:
             * {responseHeader={status=0,QTime=3}}
             *
             * Attempting to delete traces of drafts. Result: Attempted to
             * delete dataset_17_draft from Solr index. updateResponse was:
             * {responseHeader={status=0,QTime=1}}
             */
            String result = getDesiredCardState(desiredCards) + results.toString() + debug.toString();
            logger.fine(result);
            indexDatasetPermissions(dataset);
            return new AsyncResult<>(result);
        } else {
            String result = "No-op. Unexpected condition reached: No released version and latest version is neither draft nor deaccessioned";
            logger.fine(result);
            return new AsyncResult<>(result);
        }
    } else if (atLeastOnePublishedVersion) {
        results.append("Published versions found. ").append("Will attempt to index as ").append(solrIdPublished).append(" (discoverable by anonymous)\n");
        if (latestVersionState.equals(DatasetVersion.VersionState.RELEASED) || latestVersionState.equals(DatasetVersion.VersionState.DEACCESSIONED)) {
            desiredCards.put(DatasetVersion.VersionState.RELEASED, true);
            IndexableDataset indexableReleasedVersion = new IndexableDataset(releasedVersion);
            String indexReleasedVersionResult = addOrUpdateDataset(indexableReleasedVersion);
            results.append("Attempted to index " + solrIdPublished).append(". Result: ").append(indexReleasedVersionResult).append("\n");
            desiredCards.put(DatasetVersion.VersionState.DRAFT, false);
            if (doNormalSolrDocCleanUp) {
                List<String> solrDocIdsForDraftFilesToDelete = findSolrDocIdsForDraftFilesToDelete(dataset);
                String deleteDraftDatasetVersionResult = removeSolrDocFromIndex(solrIdDraftDataset);
                String deleteDraftFilesResults = deleteDraftFiles(solrDocIdsForDraftFilesToDelete);
                results.append("The latest version is published. Attempting to delete drafts. Result: ").append(deleteDraftDatasetVersionResult).append(deleteDraftFilesResults).append("\n");
            }
            desiredCards.put(DatasetVersion.VersionState.DEACCESSIONED, false);
            if (doNormalSolrDocCleanUp) {
                String deleteDeaccessionedResult = removeDeaccessioned(dataset);
                results.append("No need for deaccessioned version. Deletion attempted for ").append(solrIdDeaccessioned).append(". Result: ").append(deleteDeaccessionedResult);
            }
            /**
             * Desired state for existence of cards: {RELEASED=true,
             * DRAFT=false, DEACCESSIONED=false}
             *
             * Released versions found: 1. Will attempt to index as
             * dataset_17 (discoverable by anonymous)
             *
             * Attempted to index dataset_17. Result: indexed dataset 17 as
             * dataset_17. filesIndexed: [datafile_18]
             *
             * The latest version is published. Attempting to delete drafts.
             * Result: Attempted to delete dataset_17_draft from Solr index.
             * updateResponse was: {responseHeader={status=0,QTime=1}}
             *
             * No need for deaccessioned version. Deletion attempted for
             * dataset_17_deaccessioned. Result: Attempted to delete
             * dataset_17_deaccessioned from Solr index. updateResponse was:
             * {responseHeader={status=0,QTime=1}}Attempted to delete
             * datafile_18_deaccessioned from Solr index. updateResponse was:
             * {responseHeader={status=0,QTime=0}}
             */
            String result = getDesiredCardState(desiredCards) + results.toString() + debug.toString();
            logger.fine(result);
            indexDatasetPermissions(dataset);
            return new AsyncResult<>(result);
        } else if (latestVersionState.equals(DatasetVersion.VersionState.DRAFT)) {
            IndexableDataset indexableDraftVersion = new IndexableDataset(latestVersion);
            desiredCards.put(DatasetVersion.VersionState.DRAFT, true);
            String indexDraftResult = addOrUpdateDataset(indexableDraftVersion);
            results.append("The latest version is a working copy (latestVersionState: ").append(latestVersionStateString).append(") and will be indexed as ").append(solrIdDraftDataset).append(" (limited visibility). Result: ").append(indexDraftResult).append("\n");
            desiredCards.put(DatasetVersion.VersionState.RELEASED, true);
            IndexableDataset indexableReleasedVersion = new IndexableDataset(releasedVersion);
            String indexReleasedVersionResult = addOrUpdateDataset(indexableReleasedVersion);
            results.append("There is a published version we will attempt to index. Result: ").append(indexReleasedVersionResult).append("\n");
            desiredCards.put(DatasetVersion.VersionState.DEACCESSIONED, false);
            if (doNormalSolrDocCleanUp) {
                String deleteDeaccessionedResult = removeDeaccessioned(dataset);
                results.append("No need for deaccessioned version. Deletion attempted for ").append(solrIdDeaccessioned).append(". Result: ").append(deleteDeaccessionedResult);
            }
            /**
             * Desired state for existence of cards: {DRAFT=true,
             * RELEASED=true, DEACCESSIONED=false}
             *
             * Released versions found: 1. Will attempt to index as
             * dataset_17 (discoverable by anonymous)
             *
             * The latest version is a working copy (latestVersionState:
             * DRAFT) and will be indexed as dataset_17_draft (limited
             * visibility). Result: indexed dataset 17 as dataset_17_draft.
             * filesIndexed: [datafile_18_draft]
             *
             * There is a published version we will attempt to index.
             * Result: indexed dataset 17 as dataset_17. filesIndexed:
             * [datafile_18]
             *
             * No need for deaccessioned version. Deletion attempted for
             * dataset_17_deaccessioned. Result: Attempted to delete
             * dataset_17_deaccessioned from Solr index. updateResponse was:
             * {responseHeader={status=0,QTime=1}}Attempted to delete
             * datafile_18_deaccessioned from Solr index. updateResponse was:
             * {responseHeader={status=0,QTime=0}}
             */
            String result = getDesiredCardState(desiredCards) + results.toString() + debug.toString();
            logger.fine(result);
            indexDatasetPermissions(dataset);
            return new AsyncResult<>(result);
        } else {
            String result = "No-op. Unexpected condition reached: There is at least one published version but the latest version is neither published nor draft";
            logger.fine(result);
            return new AsyncResult<>(result);
        }
    } else {
        String result = "No-op. Unexpected condition reached: Has a version been published or not?";
        logger.fine(result);
        return new AsyncResult<>(result);
    }
}
Also used : ArrayList(java.util.ArrayList) FileMetadata(edu.harvard.iq.dataverse.FileMetadata) DatasetVersion(edu.harvard.iq.dataverse.DatasetVersion) LinkedHashMap(java.util.LinkedHashMap) SolrDocumentList(org.apache.solr.common.SolrDocumentList) List(java.util.List) ArrayList(java.util.ArrayList) AsyncResult(javax.ejb.AsyncResult)
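
The Solr document ids that drive this method (dataset_17, dataset_17_draft, dataset_17_deaccessioned, plus the matching datafile_* variants) all follow one convention: a type prefix, the database id, and an optional state suffix. Below is a minimal, self-contained sketch of that convention; the class name and the prefix/suffix constants are assumptions for illustration, standing in for the real values supplied by IndexableObject and IndexableDataset.DatasetState.

public final class SolrIdSketch {

    // Assumed values; the real ones come from IndexableObject/IndexableDataset.
    static final String DATASET_PREFIX = "dataset_";
    static final String DRAFT_SUFFIX = "_draft";
    static final String DEACCESSIONED_SUFFIX = "_deaccessioned";

    // e.g. dataset_17
    static String publishedId(long datasetId) {
        return DATASET_PREFIX + datasetId;
    }

    // e.g. dataset_17_draft
    static String draftId(long datasetId) {
        return DATASET_PREFIX + datasetId + DRAFT_SUFFIX;
    }

    // e.g. dataset_17_deaccessioned
    static String deaccessionedId(long datasetId) {
        return DATASET_PREFIX + datasetId + DEACCESSIONED_SUFFIX;
    }

    public static void main(String[] args) {
        System.out.println(publishedId(17));     // dataset_17
        System.out.println(draftId(17));         // dataset_17_draft
        System.out.println(deaccessionedId(17)); // dataset_17_deaccessioned
    }
}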

Example 42 with FileMetadata

Use of edu.harvard.iq.dataverse.FileMetadata in project dataverse by IQSS.

From the class IndexServiceBean, the method addOrUpdateDataset:

private String addOrUpdateDataset(IndexableDataset indexableDataset) {
    IndexableDataset.DatasetState state = indexableDataset.getDatasetState();
    Dataset dataset = indexableDataset.getDatasetVersion().getDataset();
    logger.fine("adding or updating Solr document for dataset id " + dataset.getId());
    Collection<SolrInputDocument> docs = new ArrayList<>();
    List<String> dataversePathSegmentsAccumulator = new ArrayList<>();
    List<String> dataverseSegments = new ArrayList<>();
    try {
        dataverseSegments = findPathSegments(dataset.getOwner(), dataversePathSegmentsAccumulator);
    } catch (Exception ex) {
        logger.info("failed to find dataverseSegments for dataversePaths for " + SearchFields.SUBTREE + ": " + ex);
    }
    List<String> dataversePaths = getDataversePathsFromSegments(dataverseSegments);
    // Add Paths for linking dataverses
    for (Dataverse linkingDataverse : dsLinkingService.findLinkingDataverses(dataset.getId())) {
        List<String> linkingDataversePathSegmentsAccumulator = new ArrayList<>();
        List<String> linkingdataverseSegments = findPathSegments(linkingDataverse, linkingDataversePathSegmentsAccumulator);
        List<String> linkingDataversePaths = getDataversePathsFromSegments(linkingdataverseSegments);
        for (String dvPath : linkingDataversePaths) {
            dataversePaths.add(dvPath);
        }
    }
    SolrInputDocument solrInputDocument = new SolrInputDocument();
    String datasetSolrDocId = indexableDataset.getSolrDocId();
    solrInputDocument.addField(SearchFields.ID, datasetSolrDocId);
    solrInputDocument.addField(SearchFields.ENTITY_ID, dataset.getId());
    String dataverseVersion = systemConfig.getVersion();
    solrInputDocument.addField(SearchFields.DATAVERSE_VERSION_INDEXED_BY, dataverseVersion);
    solrInputDocument.addField(SearchFields.IDENTIFIER, dataset.getGlobalId());
    solrInputDocument.addField(SearchFields.DATASET_PERSISTENT_ID, dataset.getGlobalId());
    solrInputDocument.addField(SearchFields.PERSISTENT_URL, dataset.getPersistentURL());
    solrInputDocument.addField(SearchFields.TYPE, "datasets");
    Date datasetSortByDate = new Date();
    Date majorVersionReleaseDate = dataset.getMostRecentMajorVersionReleaseDate();
    if (majorVersionReleaseDate != null) {
        logger.fine("major release date found: " + majorVersionReleaseDate.toString());
        datasetSortByDate = majorVersionReleaseDate;
    } else {
        if (indexableDataset.getDatasetState().equals(IndexableDataset.DatasetState.WORKING_COPY)) {
            solrInputDocument.addField(SearchFields.PUBLICATION_STATUS, UNPUBLISHED_STRING);
        } else if (indexableDataset.getDatasetState().equals(IndexableDataset.DatasetState.DEACCESSIONED)) {
            solrInputDocument.addField(SearchFields.PUBLICATION_STATUS, DEACCESSIONED_STRING);
        }
        Date createDate = dataset.getCreateDate();
        if (createDate != null) {
            logger.fine("can't find major release date, using create date: " + createDate);
            datasetSortByDate = createDate;
        } else {
            String msg = "can't find major release date or create date, using \"now\"";
            logger.info(msg);
            datasetSortByDate = new Date();
        }
    }
    solrInputDocument.addField(SearchFields.RELEASE_OR_CREATE_DATE, datasetSortByDate);
    solrInputDocument.addField(SearchFields.RELEASE_OR_CREATE_DATE_SEARCHABLE_TEXT, convertToFriendlyDate(datasetSortByDate));
    if (state.equals(IndexableDataset.DatasetState.PUBLISHED)) {
        solrInputDocument.addField(SearchFields.PUBLICATION_STATUS, PUBLISHED_STRING);
    // solrInputDocument.addField(SearchFields.RELEASE_OR_CREATE_DATE, dataset.getPublicationDate());
    } else if (state.equals(IndexableDataset.DatasetState.WORKING_COPY)) {
        solrInputDocument.addField(SearchFields.PUBLICATION_STATUS, DRAFT_STRING);
    }
    addDatasetReleaseDateToSolrDoc(solrInputDocument, dataset);
    if (dataset.isHarvested()) {
        solrInputDocument.addField(SearchFields.IS_HARVESTED, true);
        solrInputDocument.addField(SearchFields.METADATA_SOURCE, HARVESTED);
    } else {
        solrInputDocument.addField(SearchFields.IS_HARVESTED, false);
        // rootDataverseName);
        solrInputDocument.addField(SearchFields.METADATA_SOURCE, findRootDataverseCached().getName());
    }
    DatasetVersion datasetVersion = indexableDataset.getDatasetVersion();
    String parentDatasetTitle = "TBD";
    if (datasetVersion != null) {
        solrInputDocument.addField(SearchFields.DATASET_VERSION_ID, datasetVersion.getId());
        solrInputDocument.addField(SearchFields.DATASET_CITATION, datasetVersion.getCitation(false));
        solrInputDocument.addField(SearchFields.DATASET_CITATION_HTML, datasetVersion.getCitation(true));
        if (datasetVersion.isInReview()) {
            solrInputDocument.addField(SearchFields.PUBLICATION_STATUS, IN_REVIEW_STRING);
        }
        for (DatasetField dsf : datasetVersion.getFlatDatasetFields()) {
            DatasetFieldType dsfType = dsf.getDatasetFieldType();
            String solrFieldSearchable = dsfType.getSolrField().getNameSearchable();
            String solrFieldFacetable = dsfType.getSolrField().getNameFacetable();
            if (dsf.getValues() != null && !dsf.getValues().isEmpty() && dsf.getValues().get(0) != null && solrFieldSearchable != null) {
                logger.fine("indexing " + dsf.getDatasetFieldType().getName() + ":" + dsf.getValues() + " into " + solrFieldSearchable + " and maybe " + solrFieldFacetable);
                // if (dsfType.getSolrField().getSolrType().equals(SolrField.SolrType.INTEGER)) {
                if (dsfType.getSolrField().getSolrType().equals(SolrField.SolrType.EMAIL)) {
                // no-op. we want to keep email address out of Solr per https://github.com/IQSS/dataverse/issues/759
                } else if (dsfType.getSolrField().getSolrType().equals(SolrField.SolrType.DATE)) {
                    String dateAsString = dsf.getValues().get(0);
                    logger.fine("date as string: " + dateAsString);
                    if (dateAsString != null && !dateAsString.isEmpty()) {
                        SimpleDateFormat inputDateyyyy = new SimpleDateFormat("yyyy", Locale.ENGLISH);
                        try {
                            /**
                             * @todo when bean validation is working we
                             * won't have to convert strings into dates
                             */
                            logger.fine("Trying to convert " + dateAsString + " to a YYYY date from dataset " + dataset.getId());
                            Date dateAsDate = inputDateyyyy.parse(dateAsString);
                            SimpleDateFormat yearOnly = new SimpleDateFormat("yyyy");
                            String datasetFieldFlaggedAsDate = yearOnly.format(dateAsDate);
                            logger.fine("YYYY only: " + datasetFieldFlaggedAsDate);
                            // solrInputDocument.addField(solrFieldSearchable, Integer.parseInt(datasetFieldFlaggedAsDate));
                            solrInputDocument.addField(solrFieldSearchable, datasetFieldFlaggedAsDate);
                            if (dsfType.getSolrField().isFacetable()) {
                                // solrInputDocument.addField(solrFieldFacetable, Integer.parseInt(datasetFieldFlaggedAsDate));
                                solrInputDocument.addField(solrFieldFacetable, datasetFieldFlaggedAsDate);
                            }
                        } catch (Exception ex) {
                            logger.info("unable to convert " + dateAsString + " into YYYY format and couldn't index it (" + dsfType.getName() + ")");
                        }
                    }
                } else {
                    if (dsf.getDatasetFieldType().getName().equals("authorAffiliation")) {
                        /**
                         * @todo think about how to tie the fact that this
                         * needs to be multivalued (_ss) because a
                         * multivalued facet (authorAffilition_ss) is being
                         * collapsed into here at index time. The business
                         * logic to determine if a data-driven metadata
                         * field should be indexed into Solr as a single or
                         * multiple value lives in the getSolrField() method
                         * of DatasetField.java
                         */
                        solrInputDocument.addField(SearchFields.AFFILIATION, dsf.getValuesWithoutNaValues());
                    } else if (dsf.getDatasetFieldType().getName().equals("title")) {
                        // datasets have titles, not names, but we index the title under name as well so we can sort datasets by name alongside dataverses and files
                        List<String> possibleTitles = dsf.getValues();
                        String firstTitle = possibleTitles.get(0);
                        if (firstTitle != null) {
                            parentDatasetTitle = firstTitle;
                        }
                        solrInputDocument.addField(SearchFields.NAME_SORT, dsf.getValues());
                    }
                    if (dsfType.isControlledVocabulary()) {
                        for (ControlledVocabularyValue controlledVocabularyValue : dsf.getControlledVocabularyValues()) {
                            if (controlledVocabularyValue.getStrValue().equals(DatasetField.NA_VALUE)) {
                                continue;
                            }
                            solrInputDocument.addField(solrFieldSearchable, controlledVocabularyValue.getStrValue());
                            if (dsfType.getSolrField().isFacetable()) {
                                solrInputDocument.addField(solrFieldFacetable, controlledVocabularyValue.getStrValue());
                            }
                        }
                    } else if (dsfType.getFieldType().equals(DatasetFieldType.FieldType.TEXTBOX)) {
                        // strip HTML
                        List<String> htmlFreeText = StringUtil.htmlArray2textArray(dsf.getValuesWithoutNaValues());
                        solrInputDocument.addField(solrFieldSearchable, htmlFreeText);
                        if (dsfType.getSolrField().isFacetable()) {
                            solrInputDocument.addField(solrFieldFacetable, htmlFreeText);
                        }
                    } else {
                        // do not strip HTML
                        solrInputDocument.addField(solrFieldSearchable, dsf.getValuesWithoutNaValues());
                        if (dsfType.getSolrField().isFacetable()) {
                            if (dsf.getDatasetFieldType().getName().equals(DatasetFieldConstant.topicClassValue)) {
                                String topicClassificationTerm = getTopicClassificationTermOrTermAndVocabulary(dsf);
                                if (topicClassificationTerm != null) {
                                    logger.fine(solrFieldFacetable + " gets " + topicClassificationTerm);
                                    solrInputDocument.addField(solrFieldFacetable, topicClassificationTerm);
                                }
                            } else {
                                solrInputDocument.addField(solrFieldFacetable, dsf.getValuesWithoutNaValues());
                            }
                        }
                    }
                }
            }
        }
    }
    solrInputDocument.addField(SearchFields.SUBTREE, dataversePaths);
    // solrInputDocument.addField(SearchFields.HOST_DATAVERSE, dataset.getOwner().getName());
    solrInputDocument.addField(SearchFields.PARENT_ID, dataset.getOwner().getId());
    solrInputDocument.addField(SearchFields.PARENT_NAME, dataset.getOwner().getName());
    if (state.equals(IndexableDataset.DatasetState.DEACCESSIONED)) {
        String deaccessionNote = datasetVersion.getVersionNote();
        if (deaccessionNote != null) {
            solrInputDocument.addField(SearchFields.DATASET_DEACCESSION_REASON, deaccessionNote);
        }
    }
    docs.add(solrInputDocument);
    List<String> filesIndexed = new ArrayList<>();
    if (datasetVersion != null) {
        List<FileMetadata> fileMetadatas = datasetVersion.getFileMetadatas();
        boolean checkForDuplicateMetadata = false;
        if (datasetVersion.isDraft() && dataset.isReleased() && dataset.getReleasedVersion() != null) {
            checkForDuplicateMetadata = true;
            logger.fine("We are indexing a draft version of a dataset that has a released version. We'll be checking file metadatas if they are exact clones of the released versions.");
        }
        for (FileMetadata fileMetadata : fileMetadatas) {
            boolean indexThisMetadata = true;
            if (checkForDuplicateMetadata) {
                logger.fine("Checking if this file metadata is a duplicate.");
                for (FileMetadata releasedFileMetadata : dataset.getReleasedVersion().getFileMetadatas()) {
                    if (fileMetadata.getDataFile() != null && fileMetadata.getDataFile().equals(releasedFileMetadata.getDataFile())) {
                        if (fileMetadata.contentEquals(releasedFileMetadata)) {
                            indexThisMetadata = false;
                            logger.fine("This file metadata hasn't changed since the released version; skipping indexing.");
                        } else {
                            logger.fine("This file metadata has changed since the released version; we want to index it!");
                        }
                        break;
                    }
                }
            }
            if (indexThisMetadata) {
                SolrInputDocument datafileSolrInputDocument = new SolrInputDocument();
                Long fileEntityId = fileMetadata.getDataFile().getId();
                datafileSolrInputDocument.addField(SearchFields.ENTITY_ID, fileEntityId);
                datafileSolrInputDocument.addField(SearchFields.DATAVERSE_VERSION_INDEXED_BY, dataverseVersion);
                datafileSolrInputDocument.addField(SearchFields.IDENTIFIER, fileEntityId);
                datafileSolrInputDocument.addField(SearchFields.PERSISTENT_URL, dataset.getPersistentURL());
                datafileSolrInputDocument.addField(SearchFields.TYPE, "files");
                String filenameCompleteFinal = "";
                if (fileMetadata != null) {
                    String filenameComplete = fileMetadata.getLabel();
                    if (filenameComplete != null) {
                        String filenameWithoutExtension = "";
                        // String extension = "";
                        int i = filenameComplete.lastIndexOf('.');
                        if (i > 0) {
                            // extension = filenameComplete.substring(i + 1);
                            try {
                                filenameWithoutExtension = filenameComplete.substring(0, i);
                                datafileSolrInputDocument.addField(SearchFields.FILENAME_WITHOUT_EXTENSION, filenameWithoutExtension);
                                datafileSolrInputDocument.addField(SearchFields.FILE_NAME, filenameWithoutExtension);
                            } catch (IndexOutOfBoundsException ex) {
                                filenameWithoutExtension = "";
                            }
                        } else {
                            logger.fine("problem with filename '" + filenameComplete + "': no extension? empty string as filename?");
                            filenameWithoutExtension = filenameComplete;
                        }
                        filenameCompleteFinal = filenameComplete;
                    }
                    for (String tag : fileMetadata.getCategoriesByName()) {
                        datafileSolrInputDocument.addField(SearchFields.FILE_TAG, tag);
                        datafileSolrInputDocument.addField(SearchFields.FILE_TAG_SEARCHABLE, tag);
                    }
                }
                datafileSolrInputDocument.addField(SearchFields.NAME, filenameCompleteFinal);
                datafileSolrInputDocument.addField(SearchFields.NAME_SORT, filenameCompleteFinal);
                datafileSolrInputDocument.addField(SearchFields.FILE_NAME, filenameCompleteFinal);
                datafileSolrInputDocument.addField(SearchFields.DATASET_VERSION_ID, datasetVersion.getId());
                /**
                 * for rules on sorting files see
                 * https://docs.google.com/a/harvard.edu/document/d/1DWsEqT8KfheKZmMB3n_VhJpl9nIxiUjai_AIQPAjiyA/edit?usp=sharing
                 * via https://redmine.hmdc.harvard.edu/issues/3701
                 */
                Date fileSortByDate = new Date();
                DataFile datafile = fileMetadata.getDataFile();
                if (datafile != null) {
                    boolean fileHasBeenReleased = datafile.isReleased();
                    if (fileHasBeenReleased) {
                        logger.fine("indexing file with filePublicationTimestamp. " + fileMetadata.getId() + " (file id " + datafile.getId() + ")");
                        Timestamp filePublicationTimestamp = datafile.getPublicationDate();
                        if (filePublicationTimestamp != null) {
                            fileSortByDate = filePublicationTimestamp;
                        } else {
                            String msg = "filePublicationTimestamp was null for fileMetadata id " + fileMetadata.getId() + " (file id " + datafile.getId() + ")";
                            logger.info(msg);
                        }
                        datafileSolrInputDocument.addField(SearchFields.ACCESS, datafile.isRestricted() ? SearchConstants.RESTRICTED : SearchConstants.PUBLIC);
                    } else {
                        logger.fine("indexing file with fileCreateTimestamp. " + fileMetadata.getId() + " (file id " + datafile.getId() + ")");
                        Timestamp fileCreateTimestamp = datafile.getCreateDate();
                        if (fileCreateTimestamp != null) {
                            fileSortByDate = fileCreateTimestamp;
                        } else {
                            String msg = "fileCreateTimestamp was null for fileMetadata id " + fileMetadata.getId() + " (file id " + datafile.getId() + ")";
                            logger.info(msg);
                        }
                        datafileSolrInputDocument.addField(SearchFields.ACCESS, fileMetadata.isRestricted() ? SearchConstants.RESTRICTED : SearchConstants.PUBLIC);
                    }
                    if (datafile.isHarvested()) {
                        datafileSolrInputDocument.addField(SearchFields.IS_HARVESTED, true);
                        datafileSolrInputDocument.addField(SearchFields.METADATA_SOURCE, HARVESTED);
                    } else {
                        datafileSolrInputDocument.addField(SearchFields.IS_HARVESTED, false);
                        datafileSolrInputDocument.addField(SearchFields.METADATA_SOURCE, findRootDataverseCached().getName());
                    }
                }
                if (fileSortByDate == null) {
                    if (datasetSortByDate != null) {
                        logger.info("fileSortByDate was null, assigning datasetSortByDate");
                        fileSortByDate = datasetSortByDate;
                    } else {
                        logger.info("fileSortByDate and datasetSortByDate were null, assigning 'now'");
                        fileSortByDate = new Date();
                    }
                }
                datafileSolrInputDocument.addField(SearchFields.RELEASE_OR_CREATE_DATE, fileSortByDate);
                datafileSolrInputDocument.addField(SearchFields.RELEASE_OR_CREATE_DATE_SEARCHABLE_TEXT, convertToFriendlyDate(fileSortByDate));
                if (majorVersionReleaseDate == null && !datafile.isHarvested()) {
                    datafileSolrInputDocument.addField(SearchFields.PUBLICATION_STATUS, UNPUBLISHED_STRING);
                }
                if (datasetVersion.isInReview()) {
                    datafileSolrInputDocument.addField(SearchFields.PUBLICATION_STATUS, IN_REVIEW_STRING);
                }
                String fileSolrDocId = solrDocIdentifierFile + fileEntityId;
                if (state.equals(IndexableDataset.DatasetState.PUBLISHED)) {
                    datafileSolrInputDocument.addField(SearchFields.PUBLICATION_STATUS, PUBLISHED_STRING);
                    // datafileSolrInputDocument.addField(SearchFields.PERMS, publicGroupString);
                    addDatasetReleaseDateToSolrDoc(datafileSolrInputDocument, dataset);
                } else if (state.equals(IndexableDataset.DatasetState.WORKING_COPY)) {
                    fileSolrDocId = solrDocIdentifierFile + fileEntityId + state.getSuffix();
                    datafileSolrInputDocument.addField(SearchFields.PUBLICATION_STATUS, DRAFT_STRING);
                }
                datafileSolrInputDocument.addField(SearchFields.ID, fileSolrDocId);
                datafileSolrInputDocument.addField(SearchFields.FILE_TYPE_FRIENDLY, fileMetadata.getDataFile().getFriendlyType());
                datafileSolrInputDocument.addField(SearchFields.FILE_CONTENT_TYPE, fileMetadata.getDataFile().getContentType());
                datafileSolrInputDocument.addField(SearchFields.FILE_TYPE_SEARCHABLE, fileMetadata.getDataFile().getFriendlyType());
                // For the file type facets, we have a property file that maps mime types
                // to facet-friendly names; "application/fits" should become "FITS", etc.:
                datafileSolrInputDocument.addField(SearchFields.FILE_TYPE, FileUtil.getFacetFileType(fileMetadata.getDataFile()));
                datafileSolrInputDocument.addField(SearchFields.FILE_TYPE_SEARCHABLE, FileUtil.getFacetFileType(fileMetadata.getDataFile()));
                datafileSolrInputDocument.addField(SearchFields.FILE_SIZE_IN_BYTES, fileMetadata.getDataFile().getFilesize());
                if (DataFile.ChecksumType.MD5.equals(fileMetadata.getDataFile().getChecksumType())) {
                    /**
                     * @todo Someday we should probably deprecate this
                     * FILE_MD5 in favor of a combination of
                     * FILE_CHECKSUM_TYPE and FILE_CHECKSUM_VALUE.
                     */
                    datafileSolrInputDocument.addField(SearchFields.FILE_MD5, fileMetadata.getDataFile().getChecksumValue());
                }
                datafileSolrInputDocument.addField(SearchFields.FILE_CHECKSUM_TYPE, fileMetadata.getDataFile().getChecksumType().toString());
                datafileSolrInputDocument.addField(SearchFields.FILE_CHECKSUM_VALUE, fileMetadata.getDataFile().getChecksumValue());
                datafileSolrInputDocument.addField(SearchFields.DESCRIPTION, fileMetadata.getDescription());
                datafileSolrInputDocument.addField(SearchFields.FILE_DESCRIPTION, fileMetadata.getDescription());
                datafileSolrInputDocument.addField(SearchFields.UNF, fileMetadata.getDataFile().getUnf());
                datafileSolrInputDocument.addField(SearchFields.SUBTREE, dataversePaths);
                // datafileSolrInputDocument.addField(SearchFields.HOST_DATAVERSE, dataFile.getOwner().getOwner().getName());
                // datafileSolrInputDocument.addField(SearchFields.PARENT_NAME, dataFile.getDataset().getTitle());
                datafileSolrInputDocument.addField(SearchFields.PARENT_ID, fileMetadata.getDataFile().getOwner().getId());
                datafileSolrInputDocument.addField(SearchFields.PARENT_IDENTIFIER, fileMetadata.getDataFile().getOwner().getGlobalId());
                datafileSolrInputDocument.addField(SearchFields.PARENT_CITATION, fileMetadata.getDataFile().getOwner().getCitation());
                datafileSolrInputDocument.addField(SearchFields.PARENT_NAME, parentDatasetTitle);
                // for tabular data files, index the variable names and labels:
                if (fileMetadata.getDataFile().isTabularData()) {
                    List<DataVariable> variables = fileMetadata.getDataFile().getDataTable().getDataVariables();
                    for (DataVariable var : variables) {
                        if (var.getName() != null && !var.getName().equals("")) {
                            datafileSolrInputDocument.addField(SearchFields.VARIABLE_NAME, var.getName());
                        }
                        if (var.getLabel() != null && !var.getLabel().equals("")) {
                            datafileSolrInputDocument.addField(SearchFields.VARIABLE_LABEL, var.getLabel());
                        }
                    }
                    // (not to be confused with the file categories, indexed above!)
                    for (DataFileTag tag : fileMetadata.getDataFile().getTags()) {
                        String tagLabel = tag.getTypeLabel();
                        datafileSolrInputDocument.addField(SearchFields.TABDATA_TAG, tagLabel);
                    }
                }
                if (indexableDataset.isFilesShouldBeIndexed()) {
                    filesIndexed.add(fileSolrDocId);
                    docs.add(datafileSolrInputDocument);
                }
            }
        }
    }
    try {
        solrServer.add(docs);
    } catch (SolrServerException | IOException ex) {
        return ex.toString();
    }
    try {
        solrServer.commit();
    } catch (SolrServerException | IOException ex) {
        return ex.toString();
    }
    Long dsId = dataset.getId();
    // Dataset updatedDataset = (Dataset) dvObjectService.updateContentIndexTime(dataset);
    // updatedDataset = null;
    // instead of making a call to dvObjectService, let's try and
    // modify the index time stamp using the local EntityManager:
    DvObject dvObjectToModify = em.find(DvObject.class, dsId);
    dvObjectToModify.setIndexTime(new Timestamp(new Date().getTime()));
    dvObjectToModify = em.merge(dvObjectToModify);
    dvObjectToModify = null;
    // return "indexed dataset " + dataset.getId() + " as " + solrDocId + "\nindexFilesResults for " + solrDocId + ":" + fileInfo.toString();
    return "indexed dataset " + dsId + " as " + datasetSolrDocId + ". filesIndexed: " + filesIndexed;
}
Also used : DatasetField(edu.harvard.iq.dataverse.DatasetField) DvObject(edu.harvard.iq.dataverse.DvObject) SolrServerException(org.apache.solr.client.solrj.SolrServerException) ArrayList(java.util.ArrayList) FileMetadata(edu.harvard.iq.dataverse.FileMetadata) DatasetVersion(edu.harvard.iq.dataverse.DatasetVersion) DataVariable(edu.harvard.iq.dataverse.datavariable.DataVariable) Timestamp(java.sql.Timestamp) DataFile(edu.harvard.iq.dataverse.DataFile) SolrInputDocument(org.apache.solr.common.SolrInputDocument) ControlledVocabularyValue(edu.harvard.iq.dataverse.ControlledVocabularyValue) Dataset(edu.harvard.iq.dataverse.Dataset) IOException(java.io.IOException) Dataverse(edu.harvard.iq.dataverse.Dataverse) DatasetFieldType(edu.harvard.iq.dataverse.DatasetFieldType) SolrServerException(org.apache.solr.client.solrj.SolrServerException) EJBException(javax.ejb.EJBException) IOException(java.io.IOException) Date(java.util.Date) SimpleDateFormat(java.text.SimpleDateFormat) DataFileTag(edu.harvard.iq.dataverse.DataFileTag)
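
The DATE branch above boils whatever string the user entered down to a four-digit year before indexing it. Here is a standalone sketch of just that normalization step; the class and method names are hypothetical, and the only assumption is that the input string starts with a parseable yyyy year, exactly as the SimpleDateFormat round-trip in the method assumes.

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;

public class YearNormalizerSketch {

    // Returns the four-digit year for the given date string, or null if it
    // cannot be parsed. Mirrors the yyyy parse/format round-trip used above.
    static String toYearOnly(String dateAsString) {
        if (dateAsString == null || dateAsString.isEmpty()) {
            return null;
        }
        SimpleDateFormat inputDateyyyy = new SimpleDateFormat("yyyy", Locale.ENGLISH);
        try {
            // lenient parsing consumes the leading year and ignores the rest
            Date dateAsDate = inputDateyyyy.parse(dateAsString);
            return new SimpleDateFormat("yyyy", Locale.ENGLISH).format(dateAsDate);
        } catch (ParseException ex) {
            return null;
        }
    }

    public static void main(String[] args) {
        System.out.println(toYearOnly("2014-08-21")); // 2014
        System.out.println(toYearOnly("2014"));       // 2014
        System.out.println(toYearOnly("not a date")); // null
    }
}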

Example 43 with FileMetadata

Use of edu.harvard.iq.dataverse.FileMetadata in project dataverse by IQSS.

From the class DatasetUtil, the method getThumbnailCandidates:

public static List<DatasetThumbnail> getThumbnailCandidates(Dataset dataset, boolean considerDatasetLogoAsCandidate) {
    List<DatasetThumbnail> thumbnails = new ArrayList<>();
    if (dataset == null) {
        return thumbnails;
    }
    if (considerDatasetLogoAsCandidate) {
        // Path path = Paths.get(dataset.getFileSystemDirectory() + File.separator + datasetLogoThumbnail + thumb48addedByImageThumbConverter);
        // if (Files.exists(path)) {
        // logger.fine("Thumbnail created from dataset logo exists!");
        // File file = path.toFile();
        // try {
        // byte[] bytes = Files.readAllBytes(file.toPath());
        StorageIO<Dataset> dataAccess = null;
        try {
            dataAccess = DataAccess.getStorageIO(dataset);
        } catch (IOException ioex) {
            logger.fine("Failed to obtain StorageIO for dataset: " + ioex);
        }
        InputStream in = null;
        if (dataAccess != null) {
            try {
                // open the pre-generated thumbnail of the dataset logo, if one exists
                in = dataAccess.getAuxFileAsInputStream(datasetLogoThumbnail + thumb48addedByImageThumbConverter);
            } catch (Exception ex) {
                logger.fine("Failed to open dataset logo thumbnail: " + ex);
            }
        }
        if (in != null) {
            logger.fine("Thumbnail created from dataset logo exists!");
            try {
                byte[] bytes = IOUtils.toByteArray(in);
                String base64image = Base64.getEncoder().encodeToString(bytes);
                DatasetThumbnail datasetThumbnail = new DatasetThumbnail(FileUtil.DATA_URI_SCHEME + base64image, null);
                thumbnails.add(datasetThumbnail);
            } catch (IOException ex) {
                logger.warning("Unable to rescale image: " + ex);
            }
        } else {
            logger.fine("There is no thumbnail created from a dataset logo");
        }
    }
    for (FileMetadata fileMetadata : dataset.getLatestVersion().getFileMetadatas()) {
        DataFile dataFile = fileMetadata.getDataFile();
        if (dataFile != null && FileUtil.isThumbnailSupported(dataFile) && ImageThumbConverter.isThumbnailAvailable(dataFile) && !dataFile.isRestricted()) {
            String imageSourceBase64 = ImageThumbConverter.getImageThumbnailAsBase64(dataFile, ImageThumbConverter.DEFAULT_CARDIMAGE_SIZE);
            if (imageSourceBase64 != null) {
                DatasetThumbnail datasetThumbnail = new DatasetThumbnail(imageSourceBase64, dataFile);
                thumbnails.add(datasetThumbnail);
            }
        }
    }
    return thumbnails;
}
Also used : DataFile(edu.harvard.iq.dataverse.DataFile) Dataset(edu.harvard.iq.dataverse.Dataset) ByteArrayInputStream(java.io.ByteArrayInputStream) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) ArrayList(java.util.ArrayList) FileMetadata(edu.harvard.iq.dataverse.FileMetadata) IOException(java.io.IOException) IOException(java.io.IOException) FileNotFoundException(java.io.FileNotFoundException) UnsupportedEncodingException(java.io.UnsupportedEncodingException)
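
The dataset-logo branch above turns the raw thumbnail bytes into an inline data URI for the DatasetThumbnail. A minimal sketch of that encoding step follows; the DATA_URI_SCHEME value here is an assumption standing in for FileUtil.DATA_URI_SCHEME, and the manual stream copy plays the role of IOUtils.toByteArray.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Base64;

public class DataUriSketch {

    // Assumed stand-in for FileUtil.DATA_URI_SCHEME; the real constant lives in Dataverse.
    static final String DATA_URI_SCHEME = "data:image/png;base64,";

    // Read the stream fully and wrap the bytes in a data URI, as the
    // dataset-logo branch does via IOUtils.toByteArray and Base64.
    static String toDataUri(InputStream in) throws IOException {
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        byte[] chunk = new byte[8192];
        int n;
        while ((n = in.read(chunk)) != -1) {
            buffer.write(chunk, 0, n);
        }
        return DATA_URI_SCHEME + Base64.getEncoder().encodeToString(buffer.toByteArray());
    }

    public static void main(String[] args) throws IOException {
        InputStream in = new ByteArrayInputStream(new byte[] { 1, 2, 3 });
        System.out.println(toDataUri(in)); // prints: data:image/png;base64,AQID
    }
}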

Example 44 with FileMetadata

Use of edu.harvard.iq.dataverse.FileMetadata in project dataverse by IQSS.

From the class FileRecordWriter, the method createDataFile:

/**
 * Create a DataFile and a corresponding FileMetadata for a file on the filesystem, and add it to the
 * latest dataset version (if the user has AddDataset permissions for the dataset).
 * @param file file to create the DataFile from
 * @return datafile
 */
private DataFile createDataFile(File file) {
    DatasetVersion version = dataset.getLatestVersion();
    String path = file.getAbsolutePath();
    String gid = dataset.getAuthority() + dataset.getDoiSeparator() + dataset.getIdentifier();
    String relativePath = path.substring(path.indexOf(gid) + gid.length() + 1);
    // we don't determine mime type
    DataFile datafile = new DataFile("application/octet-stream");
    datafile.setStorageIdentifier(relativePath);
    datafile.setFilesize(file.length());
    datafile.setModificationTime(new Timestamp(new Date().getTime()));
    datafile.setCreateDate(new Timestamp(new Date().getTime()));
    datafile.setPermissionModificationTime(new Timestamp(new Date().getTime()));
    datafile.setOwner(dataset);
    datafile.setIngestDone();
    // check system property first, otherwise use the batch job property
    String jobChecksumType;
    if (System.getProperty("checksumType") != null) {
        jobChecksumType = System.getProperty("checksumType");
    } else {
        jobChecksumType = checksumType;
    }
    // initial default
    datafile.setChecksumType(DataFile.ChecksumType.SHA1);
    for (DataFile.ChecksumType type : DataFile.ChecksumType.values()) {
        if (jobChecksumType.equalsIgnoreCase(type.name())) {
            datafile.setChecksumType(type);
            break;
        }
    }
    // lookup the checksum value in the job's manifest hashmap
    if (jobContext.getTransientUserData() != null) {
        String checksumVal = ((Map<String, String>) jobContext.getTransientUserData()).get(relativePath);
        if (checksumVal != null) {
            datafile.setChecksumValue(checksumVal);
            // remove the key, so we can check for unused checksums when the job is complete
            ((Map<String, String>) jobContext.getTransientUserData()).remove(relativePath);
        } else {
            datafile.setChecksumValue("Unknown");
            getJobLogger().log(Level.WARNING, "Unable to find checksum in manifest for: " + file.getAbsolutePath());
        }
    } else {
        getJobLogger().log(Level.SEVERE, "No checksum hashmap found in transientUserData");
        jobContext.setExitStatus("FAILED");
        return null;
    }
    // set metadata and add to latest version
    FileMetadata fmd = new FileMetadata();
    fmd.setLabel(file.getName());
    // set the subdirectory if there is one
    if (relativePath.contains(File.separator)) {
        fmd.setDirectoryLabel(relativePath.replace(File.separator + file.getName(), ""));
    }
    fmd.setDataFile(datafile);
    datafile.getFileMetadatas().add(fmd);
    if (version.getFileMetadatas() == null) {
        version.setFileMetadatas(new ArrayList<>());
    }
    version.getFileMetadatas().add(fmd);
    fmd.setDatasetVersion(version);
    datafile = dataFileServiceBean.save(datafile);
    return datafile;
}
Also used : DataFile(edu.harvard.iq.dataverse.DataFile) ChecksumType(edu.harvard.iq.dataverse.DataFile.ChecksumType) FileMetadata(edu.harvard.iq.dataverse.FileMetadata) ArrayList(java.util.ArrayList) DatasetVersion(edu.harvard.iq.dataverse.DatasetVersion) Timestamp(java.sql.Timestamp) HashMap(java.util.HashMap) Map(java.util.Map) Date(java.util.Date)
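
createDataFile resolves the checksum algorithm in three steps: a checksumType system property wins over the batch job property, and SHA-1 is the fallback when neither matches an enum constant. Below is a standalone sketch of that lookup, with a plain two-value enum standing in for DataFile.ChecksumType; the class name is illustrative.

public class ChecksumTypeSketch {

    // Stand-in for DataFile.ChecksumType; the real enum lives in Dataverse.
    enum ChecksumType { MD5, SHA1 }

    // System property first, then the batch job property, then SHA-1.
    static ChecksumType resolve(String batchJobProperty) {
        String jobChecksumType = System.getProperty("checksumType") != null
                ? System.getProperty("checksumType")
                : batchJobProperty;
        ChecksumType resolved = ChecksumType.SHA1; // initial default
        for (ChecksumType type : ChecksumType.values()) {
            if (type.name().equalsIgnoreCase(jobChecksumType)) {
                resolved = type;
                break;
            }
        }
        return resolved;
    }

    public static void main(String[] args) {
        System.out.println(resolve("md5"));   // MD5
        System.out.println(resolve("bogus")); // SHA1 (fallback)
    }
}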

Example 45 with FileMetadata

Use of edu.harvard.iq.dataverse.FileMetadata in project dataverse by IQSS.

From the class FileRecordWriter, the method createPackageDataFile:

/**
 * Import the supplied batch of files as a single "package file" DataFile
 * (basically, a folder/directory, with the single associated DataFile/FileMetadata, etc.)
 * and add it to the
 * latest dataset version
 * @param files list of files, already copied to the dataset directory by rsync or otherwise.
 * @return datafile
 *
 * Consider:
 * instead of expecting an extra top-level directory/folder to be
 * present already, generate it here (using the standard code used for generating
 * storage identifiers for "normal" files), create it as a directory, and move
 * all the supplied files there.
 */
private DataFile createPackageDataFile(List<File> files) {
    DataFile packageFile = new DataFile(DataFileServiceBean.MIME_TYPE_PACKAGE_FILE);
    FileUtil.generateStorageIdentifier(packageFile);
    String datasetDirectory = null;
    String folderName = null;
    long totalSize = (suppliedSize != null) ? suppliedSize : 0L;
    String gid = dataset.getAuthority() + dataset.getDoiSeparator() + dataset.getIdentifier();
    // initial default
    packageFile.setChecksumType(DataFile.ChecksumType.SHA1);
    // check system property first, otherwise use the batch job property:
    String jobChecksumType;
    if (System.getProperty("checksumType") != null) {
        jobChecksumType = System.getProperty("checksumType");
    } else {
        jobChecksumType = checksumType;
    }
    for (DataFile.ChecksumType type : DataFile.ChecksumType.values()) {
        if (jobChecksumType.equalsIgnoreCase(type.name())) {
            packageFile.setChecksumType(type);
            break;
        }
    }
    for (File file : files) {
        String path = file.getAbsolutePath();
        String relativePath = path.substring(path.indexOf(gid) + gid.length() + 1);
        // the folderName and datasetDirectory need to be initialized only once:
        if (datasetDirectory == null && folderName == null) {
            datasetDirectory = path.substring(0, path.indexOf(gid) + gid.length() + 1);
            if (relativePath != null && relativePath.indexOf(File.separatorChar) > -1) {
                folderName = relativePath.substring(0, relativePath.indexOf(File.separatorChar));
            } else {
                getJobLogger().log(Level.SEVERE, "Invalid file package (files are not in a folder)");
                jobContext.setExitStatus("FAILED");
                return null;
            }
            if (!uploadFolder.equals(folderName)) {
                getJobLogger().log(Level.SEVERE, "Folder name mismatch: " + uploadFolder + " expected, " + folderName + " found.");
                jobContext.setExitStatus("FAILED");
                return null;
            }
        }
        if (suppliedSize == null) {
            totalSize += file.length();
        }
        String checksumValue;
        // lookup the checksum value in the job's manifest hashmap
        if (jobContext.getTransientUserData() != null) {
            String manifestPath = relativePath.substring(folderName.length() + 1);
            checksumValue = ((Map<String, String>) jobContext.getTransientUserData()).get(manifestPath);
            if (checksumValue != null) {
                // remove the key, so we can check for unused checksums when the job is complete
                ((Map<String, String>) jobContext.getTransientUserData()).remove(manifestPath);
            } else {
                getJobLogger().log(Level.WARNING, "Unable to find checksum in manifest for: " + file.getAbsolutePath());
            }
        } else {
            getJobLogger().log(Level.SEVERE, "No checksum hashmap found in transientUserData");
            jobContext.setExitStatus("FAILED");
            return null;
        }
    }
    if (System.getProperty("checksumManifest") != null) {
        checksumManifest = System.getProperty("checksumManifest");
    }
    File checksumManifestFile = null;
    if (checksumManifest != null && !checksumManifest.isEmpty()) {
        String checksumManifestPath = datasetDirectory + File.separator + folderName + File.separator + checksumManifest;
        checksumManifestFile = new File(checksumManifestPath);
        if (!checksumManifestFile.exists()) {
            getJobLogger().log(Level.WARNING, "Manifest file not found");
        // TODO:
        // add code to generate the manifest, if not present? -- L.A.
        } else {
            try {
                packageFile.setChecksumValue(FileUtil.CalculateCheckSum(checksumManifestPath, packageFile.getChecksumType()));
            } catch (Exception ex) {
                getJobLogger().log(Level.SEVERE, "Failed to calculate checksum (type " + packageFile.getChecksumType() + ") " + ex.getMessage());
                jobContext.setExitStatus("FAILED");
                return null;
            }
        }
    } else {
        getJobLogger().log(Level.WARNING, "No checksumManifest property supplied");
    }
    // Move the folder to the final destination:
    if (!(new File(datasetDirectory + File.separator + folderName).renameTo(new File(datasetDirectory + File.separator + packageFile.getStorageIdentifier())))) {
        getJobLogger().log(Level.SEVERE, "Could not move the file folder to the final destination (" + datasetDirectory + File.separator + packageFile.getStorageIdentifier() + ")");
        jobContext.setExitStatus("FAILED");
        return null;
    }
    packageFile.setFilesize(totalSize);
    packageFile.setModificationTime(new Timestamp(new Date().getTime()));
    packageFile.setCreateDate(new Timestamp(new Date().getTime()));
    packageFile.setPermissionModificationTime(new Timestamp(new Date().getTime()));
    packageFile.setOwner(dataset);
    dataset.getFiles().add(packageFile);
    packageFile.setIngestDone();
    // set metadata and add to latest version
    FileMetadata fmd = new FileMetadata();
    fmd.setLabel(folderName);
    fmd.setDataFile(packageFile);
    packageFile.getFileMetadatas().add(fmd);
    if (dataset.getLatestVersion().getFileMetadatas() == null) {
        dataset.getLatestVersion().setFileMetadatas(new ArrayList<>());
    }
    dataset.getLatestVersion().getFileMetadatas().add(fmd);
    fmd.setDatasetVersion(dataset.getLatestVersion());
    getJobLogger().log(Level.INFO, "Successfully created a file of type package");
    return packageFile;
}
Also used : DataFile(edu.harvard.iq.dataverse.DataFile) ChecksumType(edu.harvard.iq.dataverse.DataFile.ChecksumType) FileMetadata(edu.harvard.iq.dataverse.FileMetadata) ArrayList(java.util.ArrayList) DataFile(edu.harvard.iq.dataverse.DataFile) File(java.io.File) HashMap(java.util.HashMap) Map(java.util.Map) Timestamp(java.sql.Timestamp) CommandException(edu.harvard.iq.dataverse.engine.command.exception.CommandException) IOException(java.io.IOException) NoSuchAlgorithmException(java.security.NoSuchAlgorithmException) Date(java.util.Date)
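
Both FileRecordWriter examples share the same manifest pattern: look each relative path up in the checksum map carried in the job's transient user data, and remove the key as it is consumed, so that leftover entries expose unused checksums when the job completes. A self-contained sketch of that consume-and-remove lookup follows; the class name and the sample hash are illustrative.

import java.util.HashMap;
import java.util.Map;

public class ManifestLookupSketch {

    // Look up and consume the checksum for one file; returns null when the
    // manifest has no entry, mirroring the "Unknown" fallback above.
    static String consumeChecksum(Map<String, String> manifest, String relativePath) {
        String checksumVal = manifest.get(relativePath);
        if (checksumVal != null) {
            // remove the key, so leftover entries reveal unused checksums
            manifest.remove(relativePath);
        }
        return checksumVal;
    }

    public static void main(String[] args) {
        Map<String, String> manifest = new HashMap<>();
        manifest.put("data/file1.csv", "3f786850e387550fdab836ed7e6dc881de23001b");
        System.out.println(consumeChecksum(manifest, "data/file1.csv")); // the hash
        System.out.println(consumeChecksum(manifest, "data/file1.csv")); // null (already consumed)
        System.out.println(manifest.isEmpty()); // true -> no unused checksums
    }
}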

Aggregations

FileMetadata (edu.harvard.iq.dataverse.FileMetadata): 54
DataFile (edu.harvard.iq.dataverse.DataFile): 30
DatasetVersion (edu.harvard.iq.dataverse.DatasetVersion): 26
ArrayList (java.util.ArrayList): 23
Dataset (edu.harvard.iq.dataverse.Dataset): 18
Test (org.junit.Test): 13
Date (java.util.Date): 12
IOException (java.io.IOException): 10
Timestamp (java.sql.Timestamp): 10
DataTable (edu.harvard.iq.dataverse.DataTable): 5
DatasetField (edu.harvard.iq.dataverse.DatasetField): 5
AuthenticatedUser (edu.harvard.iq.dataverse.authorization.users.AuthenticatedUser): 5
MocksFactory.makeDataset (edu.harvard.iq.dataverse.mocks.MocksFactory.makeDataset): 5
SimpleDateFormat (java.text.SimpleDateFormat): 5
HashMap (java.util.HashMap): 5
Dataverse (edu.harvard.iq.dataverse.Dataverse): 4
File (java.io.File): 4
FileNotFoundException (java.io.FileNotFoundException): 4
JsonObjectBuilder (javax.json.JsonObjectBuilder): 4
DataFileTag (edu.harvard.iq.dataverse.DataFileTag): 3