Search in sources :

Example 21 with DataVariable

use of edu.harvard.iq.dataverse.datavariable.DataVariable in project dataverse by IQSS.

From the class RJobRequest, the method getVariableFormats:

/**
 * Getter for property variable formats.
 *
 * Builds a map from each date/time variable's (safe) name to a one-letter
 * code used downstream by the R job: "D" = date, "T" = time,
 * "JT" = dtime (duration-style time), "DT" = datetime.
 * Variables whose format category is empty are skipped.
 *
 * @return    A Map that maps a (safe) variable name to
 *            its corresponding type code, either time or date
 */
public Map<String, String> getVariableFormats() {
    Map<String, String> variableFormats = new LinkedHashMap<>();
    for (int i = 0; i < dataVariablesForRequest.size(); i++) {
        DataVariable dv = dataVariablesForRequest.get(i);
        dbgLog.fine(String.format("DvnRJobRequest: column[%d] category = %s", i, dv.getFormatCategory()));
        dbgLog.fine(i + "-th \tformatcategory=" + dv.getFormatCategory());
        if (!StringUtils.isEmpty(dv.getFormatCategory())) {
            // Hoist the repeated lowercasing; category is just "date"/"time",
            // while the concrete format string (e.g. SPSS "DTIME20.2",
            // "DATETIME23.2") carries the time sub-type.
            String formatCategory = dv.getFormatCategory().toLowerCase();
            String format = dv.getFormat() == null ? "" : dv.getFormat().toLowerCase();
            String originalFormat = dv.getDataTable().getOriginalFileFormat().toLowerCase();
            // BUGFIX: the previous code tested
            // getFormatCategory().startsWith("dtime"/"datetime") inside the
            // branch where the category *equals* "time" -- those tests could
            // never match, so the "JT"/"DT" codes were unreachable. The
            // sub-type checks below use the format string instead.
            if (originalFormat.startsWith("application/x-spss")) {
                if (formatCategory.equals("date")) {
                    // only ISO dates are treated as dates; other date
                    // formats are not going to be treated as dates
                    if ("yyyy-MM-dd".equals(dv.getFormat())) {
                        variableFormats.put(getSafeVariableName(dv.getName()), "D");
                    }
                } else if (formatCategory.equals("time")) {
                    // add this var to the map with its time sub-type code
                    if (format.startsWith("dtime")) {
                        // value JT
                        variableFormats.put(getSafeVariableName(dv.getName()), "JT");
                    } else if (format.startsWith("datetime")) {
                        // value DT
                        variableFormats.put(getSafeVariableName(dv.getName()), "DT");
                    } else {
                        // value T
                        variableFormats.put(getSafeVariableName(dv.getName()), "T");
                    }
                }
            } else if (originalFormat.startsWith("application/x-rlang-transport")) {
                // TODO: double-check that this is what we save for the original format!!
                if (formatCategory.equals("date")) {
                    // only ISO dates are treated as dates
                    if ("yyyy-MM-dd".equals(dv.getFormat())) {
                        variableFormats.put(getSafeVariableName(dv.getName()), "D");
                    }
                } else if (formatCategory.equals("time")) {
                    if (format.startsWith("dtime")) {
                        // value JT
                        variableFormats.put(getSafeVariableName(dv.getName()), "JT");
                    } else if (format.startsWith("datetime")) {
                        // Set as date-time-timezone, DT
                        variableFormats.put(getSafeVariableName(dv.getName()), "DT");
                    } else if (format.startsWith("time")) {
                        // Set as date-time-timezone, DT
                        variableFormats.put(getSafeVariableName(dv.getName()), "DT");
                    } else {
                        // value T
                        variableFormats.put(getSafeVariableName(dv.getName()), "T");
                    }
                }
            } else {
                // any other original format: only ISO dates are recognized
                if (formatCategory.equals("date")) {
                    if ("yyyy-MM-dd".equals(dv.getFormat())) {
                        variableFormats.put(getSafeVariableName(dv.getName()), "D");
                    }
                }
            }
        // TODO: (?)
        // What about STATA? -- L.A.
        } else {
            dbgLog.fine(i + "\t var: not date or time variable");
        }
    }
    dbgLog.fine("format=" + variableFormats);
    return variableFormats;
}
Also used : DataVariable(edu.harvard.iq.dataverse.datavariable.DataVariable)

Example 22 with DataVariable

use of edu.harvard.iq.dataverse.datavariable.DataVariable in project dataverse by IQSS.

From the class IndexServiceBean, the method addOrUpdateDataset:

/**
 * Builds Solr input documents for the given dataset and, when requested,
 * for each of its data files, submits them to the Solr server, and then
 * stamps the index time on the dataset's DvObject row.
 *
 * @param indexableDataset wrapper carrying the dataset version to index
 *                         plus its state (PUBLISHED / WORKING_COPY /
 *                         DEACCESSIONED)
 * @return a human-readable status string; on a Solr add/commit failure,
 *         the exception's toString() is returned instead
 */
private String addOrUpdateDataset(IndexableDataset indexableDataset) {
    IndexableDataset.DatasetState state = indexableDataset.getDatasetState();
    Dataset dataset = indexableDataset.getDatasetVersion().getDataset();
    logger.fine("adding or updating Solr document for dataset id " + dataset.getId());
    Collection<SolrInputDocument> docs = new ArrayList<>();
    // Compute the dataverse subtree paths this dataset lives under;
    // a failure here is logged and indexing proceeds with empty segments.
    List<String> dataversePathSegmentsAccumulator = new ArrayList<>();
    List<String> dataverseSegments = new ArrayList<>();
    try {
        dataverseSegments = findPathSegments(dataset.getOwner(), dataversePathSegmentsAccumulator);
    } catch (Exception ex) {
        logger.info("failed to find dataverseSegments for dataversePaths for " + SearchFields.SUBTREE + ": " + ex);
    }
    List<String> dataversePaths = getDataversePathsFromSegments(dataverseSegments);
    // Add Paths for linking dataverses
    for (Dataverse linkingDataverse : dsLinkingService.findLinkingDataverses(dataset.getId())) {
        List<String> linkingDataversePathSegmentsAccumulator = new ArrayList<>();
        List<String> linkingdataverseSegments = findPathSegments(linkingDataverse, linkingDataversePathSegmentsAccumulator);
        List<String> linkingDataversePaths = getDataversePathsFromSegments(linkingdataverseSegments);
        for (String dvPath : linkingDataversePaths) {
            dataversePaths.add(dvPath);
        }
    }
    // --- dataset-level Solr document: identity fields ---
    SolrInputDocument solrInputDocument = new SolrInputDocument();
    String datasetSolrDocId = indexableDataset.getSolrDocId();
    solrInputDocument.addField(SearchFields.ID, datasetSolrDocId);
    solrInputDocument.addField(SearchFields.ENTITY_ID, dataset.getId());
    String dataverseVersion = systemConfig.getVersion();
    solrInputDocument.addField(SearchFields.DATAVERSE_VERSION_INDEXED_BY, dataverseVersion);
    solrInputDocument.addField(SearchFields.IDENTIFIER, dataset.getGlobalId());
    solrInputDocument.addField(SearchFields.DATASET_PERSISTENT_ID, dataset.getGlobalId());
    solrInputDocument.addField(SearchFields.PERSISTENT_URL, dataset.getPersistentURL());
    solrInputDocument.addField(SearchFields.TYPE, "datasets");
    // Pick the date the dataset sorts by: major release date if any,
    // else create date, else "now". The unpublished/deaccessioned status
    // fields are only added on the no-release-date path.
    Date datasetSortByDate = new Date();
    Date majorVersionReleaseDate = dataset.getMostRecentMajorVersionReleaseDate();
    if (majorVersionReleaseDate != null) {
        // NOTE(review): "if (true)" is a leftover debug guard -- the message
        // is always built; consider removing the dead conditional.
        if (true) {
            String msg = "major release date found: " + majorVersionReleaseDate.toString();
            logger.fine(msg);
        }
        datasetSortByDate = majorVersionReleaseDate;
    } else {
        if (indexableDataset.getDatasetState().equals(IndexableDataset.DatasetState.WORKING_COPY)) {
            solrInputDocument.addField(SearchFields.PUBLICATION_STATUS, UNPUBLISHED_STRING);
        } else if (indexableDataset.getDatasetState().equals(IndexableDataset.DatasetState.DEACCESSIONED)) {
            solrInputDocument.addField(SearchFields.PUBLICATION_STATUS, DEACCESSIONED_STRING);
        }
        Date createDate = dataset.getCreateDate();
        if (createDate != null) {
            // NOTE(review): another always-true debug guard.
            if (true) {
                String msg = "can't find major release date, using create date: " + createDate;
                logger.fine(msg);
            }
            datasetSortByDate = createDate;
        } else {
            String msg = "can't find major release date or create date, using \"now\"";
            logger.info(msg);
            datasetSortByDate = new Date();
        }
    }
    solrInputDocument.addField(SearchFields.RELEASE_OR_CREATE_DATE, datasetSortByDate);
    solrInputDocument.addField(SearchFields.RELEASE_OR_CREATE_DATE_SEARCHABLE_TEXT, convertToFriendlyDate(datasetSortByDate));
    // NOTE(review): enum constants PUBLISHED/WORKING_COPY are accessed
    // through an instance here; static access (IndexableDataset.DatasetState.PUBLISHED)
    // is the conventional form.
    if (state.equals(indexableDataset.getDatasetState().PUBLISHED)) {
        solrInputDocument.addField(SearchFields.PUBLICATION_STATUS, PUBLISHED_STRING);
    // solrInputDocument.addField(SearchFields.RELEASE_OR_CREATE_DATE, dataset.getPublicationDate());
    } else if (state.equals(indexableDataset.getDatasetState().WORKING_COPY)) {
        solrInputDocument.addField(SearchFields.PUBLICATION_STATUS, DRAFT_STRING);
    }
    addDatasetReleaseDateToSolrDoc(solrInputDocument, dataset);
    // Harvested datasets are labeled with the harvest source; local ones
    // with the root dataverse name.
    if (dataset.isHarvested()) {
        solrInputDocument.addField(SearchFields.IS_HARVESTED, true);
        solrInputDocument.addField(SearchFields.METADATA_SOURCE, HARVESTED);
    } else {
        solrInputDocument.addField(SearchFields.IS_HARVESTED, false);
        // rootDataverseName);
        solrInputDocument.addField(SearchFields.METADATA_SOURCE, findRootDataverseCached().getName());
    }
    // --- dataset-level metadata fields, driven by the flat dataset fields ---
    DatasetVersion datasetVersion = indexableDataset.getDatasetVersion();
    String parentDatasetTitle = "TBD";
    if (datasetVersion != null) {
        solrInputDocument.addField(SearchFields.DATASET_VERSION_ID, datasetVersion.getId());
        solrInputDocument.addField(SearchFields.DATASET_CITATION, datasetVersion.getCitation(false));
        solrInputDocument.addField(SearchFields.DATASET_CITATION_HTML, datasetVersion.getCitation(true));
        if (datasetVersion.isInReview()) {
            solrInputDocument.addField(SearchFields.PUBLICATION_STATUS, IN_REVIEW_STRING);
        }
        for (DatasetField dsf : datasetVersion.getFlatDatasetFields()) {
            DatasetFieldType dsfType = dsf.getDatasetFieldType();
            String solrFieldSearchable = dsfType.getSolrField().getNameSearchable();
            String solrFieldFacetable = dsfType.getSolrField().getNameFacetable();
            if (dsf.getValues() != null && !dsf.getValues().isEmpty() && dsf.getValues().get(0) != null && solrFieldSearchable != null) {
                logger.fine("indexing " + dsf.getDatasetFieldType().getName() + ":" + dsf.getValues() + " into " + solrFieldSearchable + " and maybe " + solrFieldFacetable);
                // if (dsfType.getSolrField().getSolrType().equals(SolrField.SolrType.INTEGER)) {
                if (dsfType.getSolrField().getSolrType().equals(SolrField.SolrType.EMAIL)) {
                // no-op. we want to keep email address out of Solr per https://github.com/IQSS/dataverse/issues/759
                } else if (dsfType.getSolrField().getSolrType().equals(SolrField.SolrType.DATE)) {
                    // DATE fields: normalize the value to a YYYY string;
                    // unparseable values are logged and skipped.
                    String dateAsString = dsf.getValues().get(0);
                    logger.fine("date as string: " + dateAsString);
                    if (dateAsString != null && !dateAsString.isEmpty()) {
                        SimpleDateFormat inputDateyyyy = new SimpleDateFormat("yyyy", Locale.ENGLISH);
                        try {
                            /**
                             * @todo when bean validation is working we
                             * won't have to convert strings into dates
                             */
                            logger.fine("Trying to convert " + dateAsString + " to a YYYY date from dataset " + dataset.getId());
                            Date dateAsDate = inputDateyyyy.parse(dateAsString);
                            SimpleDateFormat yearOnly = new SimpleDateFormat("yyyy");
                            String datasetFieldFlaggedAsDate = yearOnly.format(dateAsDate);
                            logger.fine("YYYY only: " + datasetFieldFlaggedAsDate);
                            // solrInputDocument.addField(solrFieldSearchable, Integer.parseInt(datasetFieldFlaggedAsDate));
                            solrInputDocument.addField(solrFieldSearchable, datasetFieldFlaggedAsDate);
                            if (dsfType.getSolrField().isFacetable()) {
                                // solrInputDocument.addField(solrFieldFacetable, Integer.parseInt(datasetFieldFlaggedAsDate));
                                solrInputDocument.addField(solrFieldFacetable, datasetFieldFlaggedAsDate);
                            }
                        } catch (Exception ex) {
                            logger.info("unable to convert " + dateAsString + " into YYYY format and couldn't index it (" + dsfType.getName() + ")");
                        }
                    }
                } else {
                    if (dsf.getDatasetFieldType().getName().equals("authorAffiliation")) {
                        /**
                         * @todo think about how to tie the fact that this
                         * needs to be multivalued (_ss) because a
                         * multivalued facet (authorAffilition_ss) is being
                         * collapsed into here at index time. The business
                         * logic to determine if a data-driven metadata
                         * field should be indexed into Solr as a single or
                         * multiple value lives in the getSolrField() method
                         * of DatasetField.java
                         */
                        solrInputDocument.addField(SearchFields.AFFILIATION, dsf.getValuesWithoutNaValues());
                    } else if (dsf.getDatasetFieldType().getName().equals("title")) {
                        // datasets have titles not names but index title under name as well so we can sort datasets by name along dataverses and files
                        List<String> possibleTitles = dsf.getValues();
                        String firstTitle = possibleTitles.get(0);
                        if (firstTitle != null) {
                            parentDatasetTitle = firstTitle;
                        }
                        solrInputDocument.addField(SearchFields.NAME_SORT, dsf.getValues());
                    }
                    if (dsfType.isControlledVocabulary()) {
                        // controlled vocabulary: index each value except N/A
                        for (ControlledVocabularyValue controlledVocabularyValue : dsf.getControlledVocabularyValues()) {
                            if (controlledVocabularyValue.getStrValue().equals(DatasetField.NA_VALUE)) {
                                continue;
                            }
                            solrInputDocument.addField(solrFieldSearchable, controlledVocabularyValue.getStrValue());
                            if (dsfType.getSolrField().isFacetable()) {
                                solrInputDocument.addField(solrFieldFacetable, controlledVocabularyValue.getStrValue());
                            }
                        }
                    } else if (dsfType.getFieldType().equals(DatasetFieldType.FieldType.TEXTBOX)) {
                        // strip HTML
                        List<String> htmlFreeText = StringUtil.htmlArray2textArray(dsf.getValuesWithoutNaValues());
                        solrInputDocument.addField(solrFieldSearchable, htmlFreeText);
                        if (dsfType.getSolrField().isFacetable()) {
                            solrInputDocument.addField(solrFieldFacetable, htmlFreeText);
                        }
                    } else {
                        // do not strip HTML
                        solrInputDocument.addField(solrFieldSearchable, dsf.getValuesWithoutNaValues());
                        if (dsfType.getSolrField().isFacetable()) {
                            if (dsf.getDatasetFieldType().getName().equals(DatasetFieldConstant.topicClassValue)) {
                                String topicClassificationTerm = getTopicClassificationTermOrTermAndVocabulary(dsf);
                                if (topicClassificationTerm != null) {
                                    logger.fine(solrFieldFacetable + " gets " + topicClassificationTerm);
                                    solrInputDocument.addField(solrFieldFacetable, topicClassificationTerm);
                                }
                            } else {
                                solrInputDocument.addField(solrFieldFacetable, dsf.getValuesWithoutNaValues());
                            }
                        }
                    }
                }
            }
        }
    }
    solrInputDocument.addField(SearchFields.SUBTREE, dataversePaths);
    // solrInputDocument.addField(SearchFields.HOST_DATAVERSE, dataset.getOwner().getName());
    solrInputDocument.addField(SearchFields.PARENT_ID, dataset.getOwner().getId());
    solrInputDocument.addField(SearchFields.PARENT_NAME, dataset.getOwner().getName());
    // NOTE(review): if datasetVersion were null and the state DEACCESSIONED,
    // the dereference below would NPE -- presumably a deaccessioned
    // IndexableDataset always carries a version; confirm upstream.
    if (state.equals(indexableDataset.getDatasetState().DEACCESSIONED)) {
        String deaccessionNote = datasetVersion.getVersionNote();
        if (deaccessionNote != null) {
            solrInputDocument.addField(SearchFields.DATASET_DEACCESSION_REASON, deaccessionNote);
        }
    }
    docs.add(solrInputDocument);
    // --- per-file Solr documents ---
    List<String> filesIndexed = new ArrayList<>();
    if (datasetVersion != null) {
        List<FileMetadata> fileMetadatas = datasetVersion.getFileMetadatas();
        // When indexing a draft of a released dataset, skip file metadatas
        // that are byte-for-byte clones of the released version's.
        boolean checkForDuplicateMetadata = false;
        if (datasetVersion.isDraft() && dataset.isReleased() && dataset.getReleasedVersion() != null) {
            checkForDuplicateMetadata = true;
            logger.fine("We are indexing a draft version of a dataset that has a released version. We'll be checking file metadatas if they are exact clones of the released versions.");
        }
        for (FileMetadata fileMetadata : fileMetadatas) {
            boolean indexThisMetadata = true;
            if (checkForDuplicateMetadata) {
                logger.fine("Checking if this file metadata is a duplicate.");
                for (FileMetadata releasedFileMetadata : dataset.getReleasedVersion().getFileMetadatas()) {
                    if (fileMetadata.getDataFile() != null && fileMetadata.getDataFile().equals(releasedFileMetadata.getDataFile())) {
                        if (fileMetadata.contentEquals(releasedFileMetadata)) {
                            indexThisMetadata = false;
                            logger.fine("This file metadata hasn't changed since the released version; skipping indexing.");
                        } else {
                            logger.fine("This file metadata has changed since the released version; we want to index it!");
                        }
                        break;
                    }
                }
            }
            if (indexThisMetadata) {
                SolrInputDocument datafileSolrInputDocument = new SolrInputDocument();
                Long fileEntityId = fileMetadata.getDataFile().getId();
                datafileSolrInputDocument.addField(SearchFields.ENTITY_ID, fileEntityId);
                datafileSolrInputDocument.addField(SearchFields.DATAVERSE_VERSION_INDEXED_BY, dataverseVersion);
                datafileSolrInputDocument.addField(SearchFields.IDENTIFIER, fileEntityId);
                datafileSolrInputDocument.addField(SearchFields.PERSISTENT_URL, dataset.getPersistentURL());
                datafileSolrInputDocument.addField(SearchFields.TYPE, "files");
                // Split the filename into name-without-extension for the
                // dedicated search fields; the full name is indexed below.
                String filenameCompleteFinal = "";
                if (fileMetadata != null) {
                    String filenameComplete = fileMetadata.getLabel();
                    if (filenameComplete != null) {
                        String filenameWithoutExtension = "";
                        // String extension = "";
                        int i = filenameComplete.lastIndexOf('.');
                        if (i > 0) {
                            // extension = filenameComplete.substring(i + 1);
                            try {
                                filenameWithoutExtension = filenameComplete.substring(0, i);
                                datafileSolrInputDocument.addField(SearchFields.FILENAME_WITHOUT_EXTENSION, filenameWithoutExtension);
                                datafileSolrInputDocument.addField(SearchFields.FILE_NAME, filenameWithoutExtension);
                            } catch (IndexOutOfBoundsException ex) {
                                filenameWithoutExtension = "";
                            }
                        } else {
                            logger.fine("problem with filename '" + filenameComplete + "': no extension? empty string as filename?");
                            filenameWithoutExtension = filenameComplete;
                        }
                        filenameCompleteFinal = filenameComplete;
                    }
                    for (String tag : fileMetadata.getCategoriesByName()) {
                        datafileSolrInputDocument.addField(SearchFields.FILE_TAG, tag);
                        datafileSolrInputDocument.addField(SearchFields.FILE_TAG_SEARCHABLE, tag);
                    }
                }
                datafileSolrInputDocument.addField(SearchFields.NAME, filenameCompleteFinal);
                datafileSolrInputDocument.addField(SearchFields.NAME_SORT, filenameCompleteFinal);
                datafileSolrInputDocument.addField(SearchFields.FILE_NAME, filenameCompleteFinal);
                datafileSolrInputDocument.addField(SearchFields.DATASET_VERSION_ID, datasetVersion.getId());
                /**
                 * for rules on sorting files see
                 * https://docs.google.com/a/harvard.edu/document/d/1DWsEqT8KfheKZmMB3n_VhJpl9nIxiUjai_AIQPAjiyA/edit?usp=sharing
                 * via https://redmine.hmdc.harvard.edu/issues/3701
                 */
                Date fileSortByDate = new Date();
                DataFile datafile = fileMetadata.getDataFile();
                if (datafile != null) {
                    boolean fileHasBeenReleased = datafile.isReleased();
                    if (fileHasBeenReleased) {
                        logger.fine("indexing file with filePublicationTimestamp. " + fileMetadata.getId() + " (file id " + datafile.getId() + ")");
                        Timestamp filePublicationTimestamp = datafile.getPublicationDate();
                        if (filePublicationTimestamp != null) {
                            fileSortByDate = filePublicationTimestamp;
                        } else {
                            String msg = "filePublicationTimestamp was null for fileMetadata id " + fileMetadata.getId() + " (file id " + datafile.getId() + ")";
                            logger.info(msg);
                        }
                        datafileSolrInputDocument.addField(SearchFields.ACCESS, datafile.isRestricted() ? SearchConstants.RESTRICTED : SearchConstants.PUBLIC);
                    } else {
                        logger.fine("indexing file with fileCreateTimestamp. " + fileMetadata.getId() + " (file id " + datafile.getId() + ")");
                        Timestamp fileCreateTimestamp = datafile.getCreateDate();
                        if (fileCreateTimestamp != null) {
                            fileSortByDate = fileCreateTimestamp;
                        } else {
                            String msg = "fileCreateTimestamp was null for fileMetadata id " + fileMetadata.getId() + " (file id " + datafile.getId() + ")";
                            logger.info(msg);
                        }
                        datafileSolrInputDocument.addField(SearchFields.ACCESS, fileMetadata.isRestricted() ? SearchConstants.RESTRICTED : SearchConstants.PUBLIC);
                    }
                    if (datafile.isHarvested()) {
                        datafileSolrInputDocument.addField(SearchFields.IS_HARVESTED, true);
                        datafileSolrInputDocument.addField(SearchFields.METADATA_SOURCE, HARVESTED);
                    } else {
                        datafileSolrInputDocument.addField(SearchFields.IS_HARVESTED, false);
                        datafileSolrInputDocument.addField(SearchFields.METADATA_SOURCE, findRootDataverseCached().getName());
                    }
                }
                // NOTE(review): fileSortByDate is initialized to new Date()
                // above and never reassigned to null, so this branch looks
                // unreachable; likely leftover from an earlier version.
                if (fileSortByDate == null) {
                    if (datasetSortByDate != null) {
                        logger.info("fileSortByDate was null, assigning datasetSortByDate");
                        fileSortByDate = datasetSortByDate;
                    } else {
                        logger.info("fileSortByDate and datasetSortByDate were null, assigning 'now'");
                        fileSortByDate = new Date();
                    }
                }
                datafileSolrInputDocument.addField(SearchFields.RELEASE_OR_CREATE_DATE, fileSortByDate);
                datafileSolrInputDocument.addField(SearchFields.RELEASE_OR_CREATE_DATE_SEARCHABLE_TEXT, convertToFriendlyDate(fileSortByDate));
                // NOTE(review): datafile is dereferenced here without the
                // null check applied above -- would NPE if getDataFile()
                // returned null; confirm whether that can happen.
                if (majorVersionReleaseDate == null && !datafile.isHarvested()) {
                    datafileSolrInputDocument.addField(SearchFields.PUBLICATION_STATUS, UNPUBLISHED_STRING);
                }
                if (datasetVersion.isInReview()) {
                    datafileSolrInputDocument.addField(SearchFields.PUBLICATION_STATUS, IN_REVIEW_STRING);
                }
                // Draft file docs get the draft suffix appended to their id.
                String fileSolrDocId = solrDocIdentifierFile + fileEntityId;
                if (indexableDataset.getDatasetState().equals(indexableDataset.getDatasetState().PUBLISHED)) {
                    fileSolrDocId = solrDocIdentifierFile + fileEntityId;
                    datafileSolrInputDocument.addField(SearchFields.PUBLICATION_STATUS, PUBLISHED_STRING);
                    // datafileSolrInputDocument.addField(SearchFields.PERMS, publicGroupString);
                    addDatasetReleaseDateToSolrDoc(datafileSolrInputDocument, dataset);
                } else if (indexableDataset.getDatasetState().equals(indexableDataset.getDatasetState().WORKING_COPY)) {
                    fileSolrDocId = solrDocIdentifierFile + fileEntityId + indexableDataset.getDatasetState().getSuffix();
                    datafileSolrInputDocument.addField(SearchFields.PUBLICATION_STATUS, DRAFT_STRING);
                }
                datafileSolrInputDocument.addField(SearchFields.ID, fileSolrDocId);
                datafileSolrInputDocument.addField(SearchFields.FILE_TYPE_FRIENDLY, fileMetadata.getDataFile().getFriendlyType());
                datafileSolrInputDocument.addField(SearchFields.FILE_CONTENT_TYPE, fileMetadata.getDataFile().getContentType());
                datafileSolrInputDocument.addField(SearchFields.FILE_TYPE_SEARCHABLE, fileMetadata.getDataFile().getFriendlyType());
                // For the file type facets, we have a property file that maps mime types
                // to facet-friendly names; "application/fits" should become "FITS", etc.:
                datafileSolrInputDocument.addField(SearchFields.FILE_TYPE, FileUtil.getFacetFileType(fileMetadata.getDataFile()));
                datafileSolrInputDocument.addField(SearchFields.FILE_TYPE_SEARCHABLE, FileUtil.getFacetFileType(fileMetadata.getDataFile()));
                datafileSolrInputDocument.addField(SearchFields.FILE_SIZE_IN_BYTES, fileMetadata.getDataFile().getFilesize());
                if (DataFile.ChecksumType.MD5.equals(fileMetadata.getDataFile().getChecksumType())) {
                    /**
                     * @todo Someday we should probably deprecate this
                     * FILE_MD5 in favor of a combination of
                     * FILE_CHECKSUM_TYPE and FILE_CHECKSUM_VALUE.
                     */
                    datafileSolrInputDocument.addField(SearchFields.FILE_MD5, fileMetadata.getDataFile().getChecksumValue());
                }
                datafileSolrInputDocument.addField(SearchFields.FILE_CHECKSUM_TYPE, fileMetadata.getDataFile().getChecksumType().toString());
                datafileSolrInputDocument.addField(SearchFields.FILE_CHECKSUM_VALUE, fileMetadata.getDataFile().getChecksumValue());
                datafileSolrInputDocument.addField(SearchFields.DESCRIPTION, fileMetadata.getDescription());
                datafileSolrInputDocument.addField(SearchFields.FILE_DESCRIPTION, fileMetadata.getDescription());
                datafileSolrInputDocument.addField(SearchFields.UNF, fileMetadata.getDataFile().getUnf());
                datafileSolrInputDocument.addField(SearchFields.SUBTREE, dataversePaths);
                // datafileSolrInputDocument.addField(SearchFields.HOST_DATAVERSE, dataFile.getOwner().getOwner().getName());
                // datafileSolrInputDocument.addField(SearchFields.PARENT_NAME, dataFile.getDataset().getTitle());
                datafileSolrInputDocument.addField(SearchFields.PARENT_ID, fileMetadata.getDataFile().getOwner().getId());
                datafileSolrInputDocument.addField(SearchFields.PARENT_IDENTIFIER, fileMetadata.getDataFile().getOwner().getGlobalId());
                datafileSolrInputDocument.addField(SearchFields.PARENT_CITATION, fileMetadata.getDataFile().getOwner().getCitation());
                datafileSolrInputDocument.addField(SearchFields.PARENT_NAME, parentDatasetTitle);
                // For tabular files, also index the data variable
                // names and labels:
                if (fileMetadata.getDataFile().isTabularData()) {
                    List<DataVariable> variables = fileMetadata.getDataFile().getDataTable().getDataVariables();
                    for (DataVariable var : variables) {
                        if (var.getName() != null && !var.getName().equals("")) {
                            datafileSolrInputDocument.addField(SearchFields.VARIABLE_NAME, var.getName());
                        }
                        if (var.getLabel() != null && !var.getLabel().equals("")) {
                            datafileSolrInputDocument.addField(SearchFields.VARIABLE_LABEL, var.getLabel());
                        }
                    }
                    // (not to be confused with the file categories, indexed above!)
                    for (DataFileTag tag : fileMetadata.getDataFile().getTags()) {
                        String tagLabel = tag.getTypeLabel();
                        datafileSolrInputDocument.addField(SearchFields.TABDATA_TAG, tagLabel);
                    }
                }
                if (indexableDataset.isFilesShouldBeIndexed()) {
                    filesIndexed.add(fileSolrDocId);
                    docs.add(datafileSolrInputDocument);
                }
            }
        }
    }
    // Submit and commit; on failure the exception text is returned
    // to the caller as the status string.
    try {
        solrServer.add(docs);
    } catch (SolrServerException | IOException ex) {
        return ex.toString();
    }
    try {
        solrServer.commit();
    } catch (SolrServerException | IOException ex) {
        return ex.toString();
    }
    Long dsId = dataset.getId();
    // /Dataset updatedDataset = (Dataset)dvObjectService.updateContentIndexTime(dataset);
    // /updatedDataset = null;
    // instead of making a call to dvObjectService, let's try and
    // modify the index time stamp using the local EntityManager:
    DvObject dvObjectToModify = em.find(DvObject.class, dsId);
    dvObjectToModify.setIndexTime(new Timestamp(new Date().getTime()));
    dvObjectToModify = em.merge(dvObjectToModify);
    dvObjectToModify = null;
    // return "indexed dataset " + dataset.getId() + " as " + solrDocId + "\nindexFilesResults for " + solrDocId + ":" + fileInfo.toString();
    return "indexed dataset " + dsId + " as " + datasetSolrDocId + ". filesIndexed: " + filesIndexed;
}
Also used : DatasetField(edu.harvard.iq.dataverse.DatasetField) DvObject(edu.harvard.iq.dataverse.DvObject) SolrServerException(org.apache.solr.client.solrj.SolrServerException) ArrayList(java.util.ArrayList) FileMetadata(edu.harvard.iq.dataverse.FileMetadata) DatasetVersion(edu.harvard.iq.dataverse.DatasetVersion) DataVariable(edu.harvard.iq.dataverse.datavariable.DataVariable) Timestamp(java.sql.Timestamp) DataFile(edu.harvard.iq.dataverse.DataFile) SolrInputDocument(org.apache.solr.common.SolrInputDocument) ControlledVocabularyValue(edu.harvard.iq.dataverse.ControlledVocabularyValue) Dataset(edu.harvard.iq.dataverse.Dataset) IOException(java.io.IOException) Dataverse(edu.harvard.iq.dataverse.Dataverse) DatasetFieldType(edu.harvard.iq.dataverse.DatasetFieldType) SolrServerException(org.apache.solr.client.solrj.SolrServerException) EJBException(javax.ejb.EJBException) IOException(java.io.IOException) Date(java.util.Date) SimpleDateFormat(java.text.SimpleDateFormat) DataFileTag(edu.harvard.iq.dataverse.DataFileTag)

Example 23 with DataVariable

use of edu.harvard.iq.dataverse.datavariable.DataVariable in project dataverse by IQSS.

the class FileAccessIO method open.

@Override
public void open(DataAccessOption... options) throws IOException {
    DataAccessRequest request = this.getRequest();
    // Resolve the requested access mode up front; read access is the default.
    isWriteAccess = isWriteAccessRequested(options);
    isReadAccess = !isWriteAccess;
    if (dvObject instanceof DataFile) {
        DataFile file = this.getDataFile();
        // An explicit "noVarHeader" request parameter suppresses the tabular header line.
        if (request != null && request.getParameter("noVarHeader") != null) {
            this.setNoVarHeader(true);
        }
        String storageId = file.getStorageIdentifier();
        if (storageId == null || storageId.isEmpty()) {
            throw new IOException("Data Access: No local storage identifier defined for this datafile.");
        }
        if (isReadAccess) {
            FileInputStream stream = openLocalFileAsInputStream();
            if (stream == null) {
                throw new IOException("Failed to open local file " + getStorageLocation());
            }
            this.setInputStream(stream);
            setChannel(stream.getChannel());
            this.setSize(getLocalFileSize());
            // For tabular TSV files, prepend a generated variable-name header
            // unless it has been explicitly suppressed above.
            boolean tabularTsv = "text/tab-separated-values".equals(file.getContentType())
                    && file.isTabularData()
                    && file.getDataTable() != null;
            if (tabularTsv && !this.noVarHeader()) {
                List<DataVariable> variables = file.getDataTable().getDataVariables();
                this.setVarHeader(generateVariableHeader(variables));
            }
        } else if (isWriteAccess) {
            // Make sure the owning dataset's directory exists before opening for write.
            if (file.getOwner().getFileSystemDirectory() != null
                    && !Files.exists(file.getOwner().getFileSystemDirectory())) {
                Files.createDirectories(file.getOwner().getFileSystemDirectory());
            }
            FileOutputStream stream = openLocalFileAsOutputStream();
            if (stream == null) {
                throw new IOException("Failed to open local file " + getStorageLocation() + " for writing.");
            }
            this.setOutputStream(stream);
            setChannel(stream.getChannel());
        }
        this.setMimeType(file.getContentType());
        // The label may be unavailable (e.g. no FileMetadata); fall back to a placeholder.
        try {
            this.setFileName(file.getFileMetadata().getLabel());
        } catch (Exception ex) {
            this.setFileName("unknown");
        }
    } else if (dvObject instanceof Dataset) {
        // Dataset-level auxiliary files, e.g. image thumbnails / metadata exports.
        // TODO: do we really need to do anything here? should we return the dataset directory?
        Dataset dataset = this.getDataset();
        if (isReadAccess) {
            // Nothing to open on read: a Dataset has no direct file payload of its own.
        } else if (isWriteAccess) {
            // Create the dataset directory if it does not exist yet.
            if (dataset.getFileSystemDirectory() != null && !Files.exists(dataset.getFileSystemDirectory())) {
                Files.createDirectories(dataset.getFileSystemDirectory());
            }
            dataset.setStorageIdentifier("file://" + dataset.getAuthority() + dataset.getDoiSeparator() + dataset.getIdentifier());
        }
    } else if (dvObject instanceof Dataverse) {
        this.getDataverse();
    } else {
        throw new IOException("Data Access: Invalid DvObject type");
    }
    // This "status" is a leftover from 3.6; we don't have a use for it
    // in 4.0 yet; and we may not need it at all.
    // -- L.A. 4.0.2
    this.setStatus(200);
}
Also used : DataFile(edu.harvard.iq.dataverse.DataFile) Dataset(edu.harvard.iq.dataverse.Dataset) FileOutputStream(java.io.FileOutputStream) DataVariable(edu.harvard.iq.dataverse.datavariable.DataVariable) IOException(java.io.IOException) Dataverse(edu.harvard.iq.dataverse.Dataverse) FileInputStream(java.io.FileInputStream) IOException(java.io.IOException) FileNotFoundException(java.io.FileNotFoundException)

Example 24 with DataVariable

use of edu.harvard.iq.dataverse.datavariable.DataVariable in project dataverse by IQSS.

the class SwiftAccessIO method open.

@Override
public void open(DataAccessOption... options) throws IOException {
    DataAccessRequest request = this.getRequest();
    // Resolve the requested access mode up front; read access is the default.
    isWriteAccess = isWriteAccessRequested(options);
    isReadAccess = !isWriteAccess;
    if (dvObject instanceof DataFile) {
        DataFile file = this.getDataFile();
        // An explicit "noVarHeader" request parameter suppresses the tabular header line.
        if (request != null && request.getParameter("noVarHeader") != null) {
            this.setNoVarHeader(true);
        }
        String storageId = file.getStorageIdentifier();
        if (storageId == null || storageId.isEmpty()) {
            throw new IOException("Data Access: No local storage identifier defined for this datafile.");
        }
        if (isReadAccess) {
            InputStream stream = openSwiftFileAsInputStream();
            if (stream == null) {
                throw new IOException("Failed to open Swift file " + getStorageLocation());
            }
            this.setInputStream(stream);
            setChannel(Channels.newChannel(stream));
            // For tabular TSV files, prepend a generated variable-name header
            // unless it has been explicitly suppressed above.
            boolean tabularTsv = "text/tab-separated-values".equals(file.getContentType())
                    && file.isTabularData()
                    && file.getDataTable() != null;
            if (tabularTsv && !this.noVarHeader()) {
                this.setVarHeader(generateVariableHeader(file.getDataTable().getDataVariables()));
            }
        } else if (isWriteAccess) {
            swiftFileObject = initializeSwiftFileObject(true);
        }
        this.setMimeType(file.getContentType());
        // The label may be unavailable (e.g. no FileMetadata); fall back to a placeholder.
        try {
            this.setFileName(file.getFileMetadata().getLabel());
        } catch (Exception ex) {
            this.setFileName("unknown");
        }
    } else if (dvObject instanceof Dataset) {
        // Dataset-level auxiliary objects, such as a thumbnail or a metadata export.
        if (isReadAccess) {
            // TODO: fix this
            InputStream stream = openSwiftFileAsInputStream();
            if (stream == null) {
                throw new IOException("Failed to open Swift file " + getStorageLocation());
            }
            this.setInputStream(stream);
        } else if (isWriteAccess) {
            swiftFileObject = initializeSwiftFileObject(true);
        }
    } else if (dvObject instanceof Dataverse) {
        // No Dataverse-level Swift access is implemented.
    } else {
        throw new IOException("Data Access: Invalid DvObject type");
    }
}
Also used : DataFile(edu.harvard.iq.dataverse.DataFile) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) Dataset(edu.harvard.iq.dataverse.Dataset) DataVariable(edu.harvard.iq.dataverse.datavariable.DataVariable) IOException(java.io.IOException) Dataverse(edu.harvard.iq.dataverse.Dataverse) SignatureException(java.security.SignatureException) IOException(java.io.IOException) FileNotFoundException(java.io.FileNotFoundException) NoSuchAlgorithmException(java.security.NoSuchAlgorithmException) InvalidKeyException(java.security.InvalidKeyException)

Example 25 with DataVariable

use of edu.harvard.iq.dataverse.datavariable.DataVariable in project dataverse by IQSS.

the class StorageIOTest method testGenerateVariableHeader.

@Test
public void testGenerateVariableHeader() {
    // The header line is the tab-joined variable names, terminated by a newline.
    DataVariable var = new DataVariable();
    var.setName("Random");
    // Varargs Arrays.asList is type-safe here: no explicit array allocation and
    // no @SuppressWarnings("unchecked") needed (the call raises no unchecked warning).
    List<DataVariable> dvs = Arrays.asList(var, var);
    assertEquals("Random\tRandom\n", instance.generateVariableHeader(dvs));
    // A null variable list yields a null header rather than an empty string.
    assertEquals(null, instance.generateVariableHeader(null));
}
Also used : DataVariable(edu.harvard.iq.dataverse.datavariable.DataVariable) Test(org.junit.Test)

Aggregations

DataVariable (edu.harvard.iq.dataverse.datavariable.DataVariable)25 DataFile (edu.harvard.iq.dataverse.DataFile)8 IOException (java.io.IOException)6 ArrayList (java.util.ArrayList)5 Dataset (edu.harvard.iq.dataverse.Dataset)4 Dataverse (edu.harvard.iq.dataverse.Dataverse)4 FileInputStream (java.io.FileInputStream)4 FileMetadata (edu.harvard.iq.dataverse.FileMetadata)3 VariableCategory (edu.harvard.iq.dataverse.datavariable.VariableCategory)3 File (java.io.File)3 FileNotFoundException (java.io.FileNotFoundException)3 InputStream (java.io.InputStream)3 DataTable (edu.harvard.iq.dataverse.DataTable)2 SummaryStatistic (edu.harvard.iq.dataverse.datavariable.SummaryStatistic)2 VariableRange (edu.harvard.iq.dataverse.datavariable.VariableRange)2 NoSuchAlgorithmException (java.security.NoSuchAlgorithmException)2 SimpleDateFormat (java.text.SimpleDateFormat)2 Date (java.util.Date)2 HashMap (java.util.HashMap)2 LinkedHashMap (java.util.LinkedHashMap)2