Search in sources:

Example 1 with DataVariable

use of edu.harvard.iq.dataverse.datavariable.DataVariable in project dataverse by IQSS.

the class DataConverter method getValueTableForRequestedVariables.

private static Map<String, Map<String, String>> getValueTableForRequestedVariables(List<DataVariable> dvs) {
    Map<String, Map<String, String>> vls = new LinkedHashMap<>();
    for (DataVariable dv : dvs) {
        List<VariableCategory> varCat = new ArrayList<>(dv.getCategories());
        // collect the value -> label pairs for this variable's categories:
        Map<String, String> vl = new HashMap<>();
        for (VariableCategory vc : varCat) {
            if (vc.getLabel() != null) {
                vl.put(vc.getValue(), vc.getLabel());
            }
        }
        if (!vl.isEmpty()) {
            // key the lookup table by "v" + the variable's database id:
            vls.put("v" + dv.getId(), vl);
        }
    }
    return vls;
}
Also used : VariableCategory(edu.harvard.iq.dataverse.datavariable.VariableCategory) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) ArrayList(java.util.ArrayList) DataVariable(edu.harvard.iq.dataverse.datavariable.DataVariable) Map(java.util.Map)
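
The lookup table returned above is plain nested java.util maps, so the pattern can be exercised without any Dataverse classes. A minimal, self-contained sketch (the class name and the variable id "v42" are hypothetical) of how such a "v<id>" -> {value -> label} table might be consumed:

import java.util.LinkedHashMap;
import java.util.Map;

public class ValueLabelLookupDemo {
    public static void main(String[] args) {
        // Same shape as the return value of getValueTableForRequestedVariables():
        // outer key is "v" + variable id, inner map translates raw values to labels.
        Map<String, Map<String, String>> vls = new LinkedHashMap<>();
        Map<String, String> genderLabels = new LinkedHashMap<>();
        genderLabels.put("1", "male");
        genderLabels.put("2", "female");
        vls.put("v42", genderLabels);

        String rawValue = "2";
        // fall back to the raw value when no label is defined for it:
        String label = vls.getOrDefault("v42", Map.of()).getOrDefault(rawValue, rawValue);
        System.out.println("v42=" + rawValue + " -> " + label); // prints: v42=2 -> female
    }
}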

Example 2 with DataVariable

use of edu.harvard.iq.dataverse.datavariable.DataVariable in project dataverse by IQSS.

the class DataConverter method runFormatConversion.

// end of performFormatConversion();
// Method for (subsettable) file format conversion.
// The method needs the subsettable file saved on disk in the
// TAB-delimited format.
// Meaning, if this is a remote subsettable file, it needs to be downloaded
// and stored locally as a temporary file; and if it's a fixed-field file, it
// needs to be converted to TAB-delimited before you can feed the file
// to this method. (See the performFormatConversion() method.)
// The method below takes the tab file and sends it to the R server
// (possibly running on a remote host) and gets back the transformed copy,
// providing error-checking and diagnostics in the process.
// This is mostly Akio Sone's code from DVN3.
// (Hence some obsolete elements in the comment above: ALL of the tabular
// data files in Dataverse are saved in tab-delimited format - we no longer
// support fixed-field files!)
private static File runFormatConversion(DataFile file, File tabFile, String formatRequested) {
    if (FILE_TYPE_TAB.equals(formatRequested)) {
        return tabFile;
    }
    File formatConvertedFile = null;
    // create the service instance
    RemoteDataFrameService dfs = new RemoteDataFrameService();
    if ("RData".equals(formatRequested)) {
        List<DataVariable> dataVariables = file.getDataTable().getDataVariables();
        // build the value -> label lookup tables for the variables in this file:
        Map<String, Map<String, String>> vls = getValueTableForRequestedVariables(dataVariables);
        logger.fine("format conversion: variables(dataVariables)=" + dataVariables + "\n");
        logger.fine("format conversion: value table(vls)=" + vls + "\n");
        RJobRequest sro = new RJobRequest(dataVariables, vls);
        sro.setTabularDataFileName(tabFile.getAbsolutePath());
        sro.setRequestType(SERVICE_REQUEST_CONVERT);
        sro.setFormatRequested(FILE_TYPE_RDATA);
        // execute the service
        Map<String, String> resultInfo = dfs.execute(sro);
        // resultInfo.put("offlineCitation", citation);
        logger.fine("resultInfo=" + resultInfo + "\n");
        if ("true".equals(resultInfo.get("RexecError"))) {
            logger.fine("R-runtime error trying to convert a file.");
            return null;
        } else {
            String dataFrameFileName = resultInfo.get("dataFrameFileName");
            logger.fine("data frame file name: " + dataFrameFileName);
            formatConvertedFile = new File(dataFrameFileName);
        }
    } else if ("prep".equals(formatRequested)) {
        formatConvertedFile = dfs.runDataPreprocessing(file);
    } else {
        logger.warning("Unsupported file format requested: " + formatRequested);
        return null;
    }
    if (formatConvertedFile != null && formatConvertedFile.exists()) {
        logger.fine("frmtCnvrtdFile:length=" + formatConvertedFile.length());
    } else {
        logger.warning("Format-converted file was not properly created.");
        return null;
    }
    return formatConvertedFile;
}
Also used : DataVariable(edu.harvard.iq.dataverse.datavariable.DataVariable) DataFile(edu.harvard.iq.dataverse.DataFile) File(java.io.File) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) Map(java.util.Map)
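
The contract between runFormatConversion() and RemoteDataFrameService.execute() is a plain Map<String, String>: callers must check the "RexecError" flag before trusting the "dataFrameFileName" entry. A self-contained sketch of that result-map protocol (the class and helper names are hypothetical):

import java.io.File;
import java.util.Map;

public class ResultMapDemo {
    // Mirrors the error handling in runFormatConversion(): trust
    // "dataFrameFileName" only when "RexecError" is not "true".
    static File toFile(Map<String, String> resultInfo) {
        if ("true".equals(resultInfo.get("RexecError"))) {
            return null; // R-side failure
        }
        String name = resultInfo.get("dataFrameFileName");
        return (name == null) ? null : new File(name);
    }

    public static void main(String[] args) {
        System.out.println(toFile(Map.of("RexecError", "true")));                    // null
        System.out.println(toFile(Map.of("dataFrameFileName", "/tmp/frame.RData"))); // /tmp/frame.RData
    }
}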

Example 3 with DataVariable

use of edu.harvard.iq.dataverse.datavariable.DataVariable in project dataverse by IQSS.

the class S3AccessIO method open.

@Override
public void open(DataAccessOption... options) throws IOException {
    if (s3 == null) {
        throw new IOException("ERROR: s3 not initialised. ");
    }
    if (bucketName == null || !s3.doesBucketExist(bucketName)) {
        throw new IOException("ERROR: S3AccessIO - You must create and configure a bucket before creating datasets.");
    }
    DataAccessRequest req = this.getRequest();
    if (isWriteAccessRequested(options)) {
        isWriteAccess = true;
        isReadAccess = false;
    } else {
        isWriteAccess = false;
        isReadAccess = true;
    }
    if (dvObject instanceof DataFile) {
        String storageIdentifier = dvObject.getStorageIdentifier();
        DataFile dataFile = this.getDataFile();
        if (req != null && req.getParameter("noVarHeader") != null) {
            this.setNoVarHeader(true);
        }
        if (storageIdentifier == null || "".equals(storageIdentifier)) {
            throw new FileNotFoundException("Data Access: No local storage identifier defined for this datafile.");
        }
        if (isReadAccess) {
            key = getMainFileKey();
            S3Object s3object = s3.getObject(new GetObjectRequest(bucketName, key));
            InputStream in = s3object.getObjectContent();
            if (in == null) {
                throw new IOException("Cannot get Object" + key);
            }
            this.setInputStream(in);
            setChannel(Channels.newChannel(in));
            this.setSize(s3object.getObjectMetadata().getContentLength());
            if (dataFile.getContentType() != null && dataFile.getContentType().equals("text/tab-separated-values") && dataFile.isTabularData() && dataFile.getDataTable() != null && (!this.noVarHeader())) {
                List<DataVariable> datavariables = dataFile.getDataTable().getDataVariables();
                String varHeaderLine = generateVariableHeader(datavariables);
                this.setVarHeader(varHeaderLine);
            }
        } else if (isWriteAccess) {
            key = dataFile.getOwner().getAuthority() + "/" + dataFile.getOwner().getIdentifier();
            if (storageIdentifier.startsWith(S3_IDENTIFIER_PREFIX + "://")) {
                key += "/" + storageIdentifier.substring(storageIdentifier.lastIndexOf(":") + 1);
            } else {
                key += "/" + storageIdentifier;
                dvObject.setStorageIdentifier(S3_IDENTIFIER_PREFIX + "://" + bucketName + ":" + storageIdentifier);
            }
        }
        this.setMimeType(dataFile.getContentType());
        try {
            this.setFileName(dataFile.getFileMetadata().getLabel());
        } catch (Exception ex) {
            this.setFileName("unknown");
        }
    } else if (dvObject instanceof Dataset) {
        Dataset dataset = this.getDataset();
        key = dataset.getAuthority() + "/" + dataset.getIdentifier();
        dataset.setStorageIdentifier(S3_IDENTIFIER_PREFIX + "://" + key);
    } else if (dvObject instanceof Dataverse) {
        throw new IOException("Data Access: Invalid DvObject type : Dataverse");
    } else {
        throw new IOException("Data Access: Invalid DvObject type");
    }
}
Also used : ByteArrayInputStream(java.io.ByteArrayInputStream) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) Dataset(edu.harvard.iq.dataverse.Dataset) FileNotFoundException(java.io.FileNotFoundException) DataVariable(edu.harvard.iq.dataverse.datavariable.DataVariable) IOException(java.io.IOException) Dataverse(edu.harvard.iq.dataverse.Dataverse) MultiObjectDeleteException(com.amazonaws.services.s3.model.MultiObjectDeleteException) AmazonClientException(com.amazonaws.AmazonClientException) SdkClientException(com.amazonaws.SdkClientException) DataFile(edu.harvard.iq.dataverse.DataFile) S3Object(com.amazonaws.services.s3.model.S3Object) GetObjectRequest(com.amazonaws.services.s3.model.GetObjectRequest)
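
The write branch of open() encodes the S3 object key as <authority>/<identifier>/<storage-part>, stripping the fully qualified prefix when the storage identifier already carries one. A standalone sketch of that rule (the class is hypothetical, and the "s3" prefix value is an assumption; the real constant lives in S3AccessIO):

public class S3KeyDemo {
    static final String S3_IDENTIFIER_PREFIX = "s3"; // assumed value, for illustration only

    static String buildKey(String authority, String identifier, String storageIdentifier) {
        String key = authority + "/" + identifier;
        if (storageIdentifier.startsWith(S3_IDENTIFIER_PREFIX + "://")) {
            // already fully qualified ("s3://<bucket>:<name>"): keep only the part after the last ':'
            return key + "/" + storageIdentifier.substring(storageIdentifier.lastIndexOf(':') + 1);
        }
        // plain identifier: append as-is (open() would also rewrite dvObject's storage identifier)
        return key + "/" + storageIdentifier;
    }

    public static void main(String[] args) {
        System.out.println(buildKey("10.5072", "FK2/ABCDEF", "169c2a4c3d1-1a2b3c4d5e6f"));
        System.out.println(buildKey("10.5072", "FK2/ABCDEF", "s3://mybucket:169c2a4c3d1-1a2b3c4d5e6f"));
        // both print: 10.5072/FK2/ABCDEF/169c2a4c3d1-1a2b3c4d5e6f
    }
}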

Example 4 with DataVariable

use of edu.harvard.iq.dataverse.datavariable.DataVariable in project dataverse by IQSS.

the class DTA117FileReader method readVariableTypes.

/* 
     * Variable type information is stored in the <variable_types>...</variable_types>
     * section, as number_of_variables consecutive 2-byte values.
     * The type codes are defined as follows:
     * (TODO: ...)
    */
private void readVariableTypes(DataReader reader) throws IOException {
    // TODO:
    // check that we are at the right byte offset!
    logger.fine("Type section; at offset " + reader.getByteOffset() + "; dta map offset: " + dtaMap.getOffset_types());
    reader.readOpeningTag(TAG_VARIABLE_TYPES);
    List<DataVariable> variableList = new ArrayList<DataVariable>();
    // setup variableTypeList
    variableTypes = new String[dataTable.getVarQuantity().intValue()];
    for (int i = 0; i < dataTable.getVarQuantity(); i++) {
        int type = reader.readShortInteger();
        logger.fine("variable " + i + ": type=" + type);
        DataVariable dv = new DataVariable();
        dv.setInvalidRanges(new ArrayList<VariableRange>());
        dv.setSummaryStatistics(new ArrayList<SummaryStatistic>());
        dv.setCategories(new ArrayList<VariableCategory>());
        dv.setUnf("UNF:pending");
        dv.setFileOrder(i);
        dv.setDataTable(dataTable);
        variableTypes[i] = configureVariableType(dv, type);
        // TODO:
        // we could also calculate the byte offset table now, rather
        // than figuring it out later... - ?
        variableList.add(dv);
    }
    reader.readClosingTag(TAG_VARIABLE_TYPES);
    dataTable.setDataVariables(variableList);
}
Also used : VariableCategory(edu.harvard.iq.dataverse.datavariable.VariableCategory) VariableRange(edu.harvard.iq.dataverse.datavariable.VariableRange) SummaryStatistic(edu.harvard.iq.dataverse.datavariable.SummaryStatistic) DataVariable(edu.harvard.iq.dataverse.datavariable.DataVariable)
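
Stripped of the DataVariable bookkeeping, the loop above reads number_of_variables consecutive 2-byte integers. A minimal sketch of that read using plain java.nio instead of the project's DataReader (the type codes are made up, and LSF/little-endian byte order is assumed, as declared in the .dta header):

import java.nio.ByteBuffer;
import java.nio.ByteOrder;

public class VariableTypesDemo {
    public static void main(String[] args) {
        // six bytes = three 2-byte type codes
        byte[] section = {65, 0, (byte) 253, 7, 32, 0};
        ByteBuffer buf = ByteBuffer.wrap(section).order(ByteOrder.LITTLE_ENDIAN);
        int nvar = section.length / 2;
        for (int i = 0; i < nvar; i++) {
            int type = buf.getShort() & 0xFFFF; // read as an unsigned 16-bit value
            System.out.println("variable " + i + ": type=" + type);
        }
    }
}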

Example 5 with DataVariable

use of edu.harvard.iq.dataverse.datavariable.DataVariable in project dataverse by IQSS.

the class DTAFileReader method decodeHeader.

private void decodeHeader(BufferedInputStream stream) throws IOException {
    dbgLog.fine("***** decodeHeader(): start *****");
    if (stream == null) {
        throw new IllegalArgumentException("stream == null!");
    }
    dbgLog.fine("reading the header segument 1: 4 byte\n");
    byte[] magic_number = new byte[DTA_MAGIC_NUMBER_LENGTH];
    int nbytes = stream.read(magic_number, 0, DTA_MAGIC_NUMBER_LENGTH);
    if (nbytes == 0) {
        throw new IOException();
    }
    if (dbgLog.isLoggable(Level.FINE)) {
        dbgLog.fine("hex dump: 1st 4bytes =>" + new String(Hex.encodeHex(magic_number)) + "<-");
    }
    if (magic_number[2] != 1) {
        dbgLog.fine("3rd byte is not 1: given file is not stata-dta type");
        throw new IllegalArgumentException("The file is not in a STATA format that we can read or support.");
    } else if ((magic_number[1] != 1) && (magic_number[1] != 2)) {
        dbgLog.fine("2nd byte is neither 0 nor 1: this file is not stata-dta type");
        throw new IllegalArgumentException("given file is not stata-dta type");
    } else if (!STATA_RELEASE_NUMBER.containsKey((int) magic_number[0])) {
        dbgLog.fine("1st byte (" + magic_number[0] + ") is not within the ingestable range [rel. 3-10]:" + "we cannot ingest this Stata file.");
        throw new IllegalArgumentException("given file is not stata-dta type");
    } else {
        releaseNumber = magic_number[0];
        init();
        dataTable.setOriginalFileFormat(MIME_TYPE[0]);
        /* 
             * releaseNumber: 
             * for storing in the datatable, we are converting the numeric Stata
             * release number into a more user friendly "version number"; 
             * e.g., "release number 115" = "Stata v. 12"
             * -- L.A. 4.0 
             */
        dataTable.setOriginalFormatVersion(STATA_RELEASE_NUMBER.get(releaseNumber));
        dataTable.setUnf("UNF:6:FILEFILEFILEFILE");
        if (dbgLog.isLoggable(Level.FINE)) {
            dbgLog.fine("this file is stata-dta type: " + STATA_RELEASE_NUMBER.get(releaseNumber) + " (that means Stata version " + releaseNumber + ")");
        }
        if (dbgLog.isLoggable(Level.FINE)) {
            dbgLog.fine("Endian(file)(Big: 1; Little:2)=" + magic_number[1]);
        }
        /* 
             * byte order: defined in the second byte of the "magic number": 
             */
        if (magic_number[1] == 2) {
            isLittleEndian = true;
            dbgLog.fine("Reversal of the bytes is necessary to decode " + "multi-byte fields");
        }
        if (dbgLog.isLoggable(Level.FINE)) {
            dbgLog.fine("Endian of this platform:" + ByteOrder.nativeOrder().toString());
        }
    }
    dbgLog.fine("reading the remaining header segument 2: 60 or 109-byte");
    byte[] header = new byte[headerLength];
    nbytes = stream.read(header, 0, headerLength);
    // 1. number of variables: short (2 bytes)
    ByteBuffer bbnvar = ByteBuffer.wrap(header, 0, NVAR_FIELD_LENGTH);
    ByteBuffer dupnvar = bbnvar.duplicate();
    short short_nvar = dupnvar.getShort();
    if (dbgLog.isLoggable(Level.FINE)) {
        dbgLog.fine("get original short view(nvar)=" + short_nvar);
    }
    if (isLittleEndian) {
        bbnvar.order(ByteOrder.LITTLE_ENDIAN);
    }
    short shrt_nvar = bbnvar.getShort();
    dataTable.setVarQuantity(Long.valueOf(shrt_nvar));
    int nvar = shrt_nvar;
    if (dbgLog.isLoggable(Level.INFO)) {
        dbgLog.info("number of variables(nvar)=" + nvar);
    }
    // 4.0 Initialize dataverse variable objects:
    List<DataVariable> variableList = new ArrayList<>();
    for (int i = 0; i < nvar; i++) {
        DataVariable dv = new DataVariable();
        dv.setInvalidRanges(new ArrayList<>());
        dv.setSummaryStatistics(new ArrayList<>());
        dv.setUnf("UNF:6:XXX");
        dv.setCategories(new ArrayList<>());
        variableList.add(dv);
        dv.setFileOrder(i);
        dv.setDataTable(dataTable);
    }
    dataTable.setDataVariables(variableList);
    // setup variableTypeList
    variableTypes = new String[nvar];
    // and the date/time format list:
    dateVariableFormats = new String[nvar];
    // 2. number of observations: int (4 bytes)
    ByteBuffer nobs = ByteBuffer.wrap(header, NVAR_FIELD_LENGTH, NOBS_FIELD_LENGTH);
    ByteBuffer dupnobs = nobs.duplicate();
    int int_dupnobs = dupnobs.getInt();
    if (dbgLog.isLoggable(Level.FINE)) {
        dbgLog.fine("raw nobs=" + int_dupnobs);
    }
    if (isLittleEndian) {
        nobs.order(ByteOrder.LITTLE_ENDIAN);
    }
    int int_nobs = nobs.getInt();
    if (dbgLog.isLoggable(Level.FINE)) {
        dbgLog.fine("reversed nobs=" + int_nobs);
    }
    // smd.getFileInformation().put("caseQnty", new Integer(int_nobs));
    dataTable.setCaseQuantity(Long.valueOf(int_nobs));
    /* 
         the "data label" - 
         note that we are not using this label for anything 
         (wonder what it is though? can we use it somewhere?)
         but we still need to extract it from the byte stream, 
         since the offsets of the objects stored further up
         are calculated relative to it. -- L.A., 4.0
         */
    // 3. data_label: 32 or 81 bytes
    int dl_offset = NVAR_FIELD_LENGTH + NOBS_FIELD_LENGTH;
    if (dbgLog.isLoggable(Level.FINE)) {
        dbgLog.fine("dl_offset=" + dl_offset);
    }
    if (dbgLog.isLoggable(Level.FINE)) {
        dbgLog.fine("data_label_length=" + dataLabelLength);
    }
    String data_label = new String(Arrays.copyOfRange(header, dl_offset, (dl_offset + dataLabelLength)), "ISO-8859-1");
    if (dbgLog.isLoggable(Level.FINE)) {
        dbgLog.fine("data_label_length=" + data_label.length());
    }
    if (dbgLog.isLoggable(Level.FINE)) {
        dbgLog.fine("loation of the null character=" + data_label.indexOf(0));
    }
    String dataLabel = getNullStrippedString(data_label);
    if (dbgLog.isLoggable(Level.FINE)) {
        dbgLog.fine("data_label_length=" + dataLabel.length());
    }
    if (dbgLog.isLoggable(Level.FINE)) {
        dbgLog.fine("data_label=[" + dataLabel + "]");
    }
    // added after release 4
    if (releaseNumber > 104) {
        int ts_offset = dl_offset + dataLabelLength;
        String time_stamp = new String(Arrays.copyOfRange(header, ts_offset, ts_offset + TIME_STAMP_LENGTH), "ISO-8859-1");
        if (dbgLog.isLoggable(Level.FINE)) {
            dbgLog.fine("time_stamp_length=" + time_stamp.length());
        }
        if (dbgLog.isLoggable(Level.FINE)) {
            dbgLog.fine("loation of the null character=" + time_stamp.indexOf(0));
        }
        String timeStamp = getNullStrippedString(time_stamp);
        if (dbgLog.isLoggable(Level.FINE)) {
            dbgLog.fine("timeStamp_length=" + timeStamp.length());
        }
        if (dbgLog.isLoggable(Level.FINE)) {
            dbgLog.fine("timeStamp=[" + timeStamp + "]");
        }
    }
}
Also used : DataVariable(edu.harvard.iq.dataverse.datavariable.DataVariable)
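
The header dispatch above boils down to three checks on the 4-byte magic number: byte 0 is the Stata release number, byte 1 encodes the byte order (1 = big-endian, 2 = little-endian), and byte 2 must be 1. A condensed, self-contained sketch of those checks (the sample bytes are hypothetical):

import java.nio.ByteOrder;

public class DtaMagicDemo {
    public static void main(String[] args) {
        byte[] magic = {114, 2, 1, 0}; // e.g. release 114, little-endian file
        if (magic[2] != 1) {
            throw new IllegalArgumentException("The file is not in a STATA format that we can read or support.");
        }
        boolean isLittleEndian = (magic[1] == 2);
        System.out.println("release=" + magic[0]
                + ", file byte order=" + (isLittleEndian ? "LITTLE_ENDIAN" : "BIG_ENDIAN")
                + ", platform byte order=" + ByteOrder.nativeOrder());
    }
}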

Aggregations

DataVariable (edu.harvard.iq.dataverse.datavariable.DataVariable): 25
DataFile (edu.harvard.iq.dataverse.DataFile): 8
IOException (java.io.IOException): 6
ArrayList (java.util.ArrayList): 5
Dataset (edu.harvard.iq.dataverse.Dataset): 4
Dataverse (edu.harvard.iq.dataverse.Dataverse): 4
FileInputStream (java.io.FileInputStream): 4
FileMetadata (edu.harvard.iq.dataverse.FileMetadata): 3
VariableCategory (edu.harvard.iq.dataverse.datavariable.VariableCategory): 3
File (java.io.File): 3
FileNotFoundException (java.io.FileNotFoundException): 3
InputStream (java.io.InputStream): 3
DataTable (edu.harvard.iq.dataverse.DataTable): 2
SummaryStatistic (edu.harvard.iq.dataverse.datavariable.SummaryStatistic): 2
VariableRange (edu.harvard.iq.dataverse.datavariable.VariableRange): 2
NoSuchAlgorithmException (java.security.NoSuchAlgorithmException): 2
SimpleDateFormat (java.text.SimpleDateFormat): 2
Date (java.util.Date): 2
HashMap (java.util.HashMap): 2
LinkedHashMap (java.util.LinkedHashMap): 2