Search in sources:

Example 1 with JSONObject

Use of org.apache.wink.json4j.JSONObject in project incubator-systemml by apache.

In class DataTransform, method processSpecFile.

/**
	 * Convert an input transformation specification with column names into a
	 * specification with the corresponding column IDs. The resulting
	 * specification is sent to all the relevant MR jobs.
	 * 
	 * @param fs file system
	 * @param inputPath input file path
	 * @param smallestFile name of the smallest input file
	 * @param colNames column names
	 * @param prop csv file format properties
	 * @param specWithNames transformation specification with column names
	 * @return specification as a JSONObject
	 * @throws IllegalArgumentException if IllegalArgumentException occurs
	 * @throws IOException if IOException occurs
	 * @throws JSONException if JSONException occurs
	 */
private static String processSpecFile(FileSystem fs, String inputPath, String smallestFile, HashMap<String, Integer> colNames, CSVFileFormatProperties prop, String specWithNames) throws IllegalArgumentException, IOException, JSONException {
    JSONObject inputSpec = new JSONObject(specWithNames);
    final String NAME = "name";
    final String ID = "id";
    final String METHOD = "method";
    final String VALUE = "value";
    final String MV_METHOD_MEAN = "global_mean";
    final String MV_METHOD_MODE = "global_mode";
    final String MV_METHOD_CONSTANT = "constant";
    final String BIN_METHOD_WIDTH = "equi-width";
    final String BIN_METHOD_HEIGHT = "equi-height";
    final String SCALE_METHOD_Z = "z-score";
    final String SCALE_METHOD_M = "mean-subtraction";
    final String JSON_BYPOS = "ids";
    String stmp = null;
    JSONObject entry = null;
    byte btmp = 0;
    final int[] mvList;
    int[] rcdList, dcdList, omitList;
    final int[] binList;
    final int[] scaleList;
    byte[] mvMethods = null, binMethods = null, scaleMethods = null;
    Object[] numBins = null;
    Object[] mvConstants = null;
    boolean byPositions = (inputSpec.containsKey(JSON_BYPOS) && ((Boolean) inputSpec.get(JSON_BYPOS)).booleanValue());
    // Omit
    if (inputSpec.containsKey(TfUtils.TXMETHOD_OMIT)) {
        JSONArray arrtmp = (JSONArray) inputSpec.get(TfUtils.TXMETHOD_OMIT);
        omitList = new int[arrtmp.size()];
        for (int i = 0; i < arrtmp.size(); i++) {
            if (byPositions)
                omitList[i] = UtilFunctions.toInt(arrtmp.get(i));
            else {
                stmp = UtilFunctions.unquote((String) arrtmp.get(i));
                omitList[i] = colNames.get(stmp);
            }
        }
        Arrays.sort(omitList);
    } else
        omitList = null;
    // Missing value imputation
    if (inputSpec.containsKey(TfUtils.TXMETHOD_IMPUTE)) {
        JSONArray arrtmp = (JSONArray) inputSpec.get(TfUtils.TXMETHOD_IMPUTE);
        mvList = new int[arrtmp.size()];
        mvMethods = new byte[arrtmp.size()];
        mvConstants = new Object[arrtmp.size()];
        for (int i = 0; i < arrtmp.size(); i++) {
            entry = (JSONObject) arrtmp.get(i);
            if (byPositions) {
                mvList[i] = UtilFunctions.toInt(entry.get(ID));
            } else {
                stmp = UtilFunctions.unquote((String) entry.get(NAME));
                mvList[i] = colNames.get(stmp);
            }
            stmp = UtilFunctions.unquote((String) entry.get(METHOD));
            if (stmp.equals(MV_METHOD_MEAN))
                btmp = (byte) 1;
            else if (stmp.equals(MV_METHOD_MODE))
                btmp = (byte) 2;
            else if (stmp.equals(MV_METHOD_CONSTANT))
                btmp = (byte) 3;
            else
                throw new IOException("Unknown missing value imputation method (" + stmp + ") in transformation specification: " + specWithNames);
            mvMethods[i] = btmp;
            //txMethods.add( btmp );
            mvConstants[i] = null;
            if (entry.containsKey(VALUE))
                mvConstants[i] = entry.get(VALUE);
        }
        Integer[] idx = new Integer[mvList.length];
        for (int i = 0; i < mvList.length; i++) idx[i] = i;
        Arrays.sort(idx, new Comparator<Integer>() {

            @Override
            public int compare(Integer o1, Integer o2) {
                return (mvList[o1] - mvList[o2]);
            }
        });
        // rearrange mvList, mvMethods, and mvConstants according to permutation idx
        inplacePermute(mvList, mvMethods, mvConstants, idx);
    } else
        mvList = null;
    // Recoding
    if (inputSpec.containsKey(TfUtils.TXMETHOD_RECODE)) {
        JSONArray arrtmp = (JSONArray) inputSpec.get(TfUtils.TXMETHOD_RECODE);
        rcdList = new int[arrtmp.size()];
        for (int i = 0; i < arrtmp.size(); i++) {
            if (byPositions)
                rcdList[i] = UtilFunctions.toInt(arrtmp.get(i));
            else {
                stmp = UtilFunctions.unquote((String) arrtmp.get(i));
                rcdList[i] = colNames.get(stmp);
            }
        }
        Arrays.sort(rcdList);
    } else
        rcdList = null;
    // Binning
    if (inputSpec.containsKey(TfUtils.TXMETHOD_BIN)) {
        JSONArray arrtmp = (JSONArray) inputSpec.get(TfUtils.TXMETHOD_BIN);
        binList = new int[arrtmp.size()];
        binMethods = new byte[arrtmp.size()];
        numBins = new Object[arrtmp.size()];
        for (int i = 0; i < arrtmp.size(); i++) {
            entry = (JSONObject) arrtmp.get(i);
            if (byPositions) {
                binList[i] = UtilFunctions.toInt(entry.get(ID));
            } else {
                stmp = UtilFunctions.unquote((String) entry.get(NAME));
                binList[i] = colNames.get(stmp);
            }
            stmp = UtilFunctions.unquote((String) entry.get(METHOD));
            if (stmp.equals(BIN_METHOD_WIDTH))
                btmp = (byte) 1;
            else if (stmp.equals(BIN_METHOD_HEIGHT))
                throw new IOException("Equi-height binning method is not yet supported, in transformation specification: " + specWithNames);
            else
                throw new IOException("Unknown missing value imputation method (" + stmp + ") in transformation specification: " + specWithNames);
            binMethods[i] = btmp;
            numBins[i] = entry.get(TfUtils.JSON_NBINS);
            if (((Integer) numBins[i]).intValue() <= 1)
                throw new IllegalArgumentException("Invalid transformation on column \"" + (String) entry.get(NAME) + "\". Number of bins must be greater than 1.");
        }
        Integer[] idx = new Integer[binList.length];
        for (int i = 0; i < binList.length; i++) idx[i] = i;
        Arrays.sort(idx, new Comparator<Integer>() {

            @Override
            public int compare(Integer o1, Integer o2) {
                return (binList[o1] - binList[o2]);
            }
        });
        // rearrange binList and binMethods according to permutation idx
        inplacePermute(binList, binMethods, numBins, idx);
    } else
        binList = null;
    // Dummycoding
    if (inputSpec.containsKey(TfUtils.TXMETHOD_DUMMYCODE)) {
        JSONArray arrtmp = (JSONArray) inputSpec.get(TfUtils.TXMETHOD_DUMMYCODE);
        dcdList = new int[arrtmp.size()];
        for (int i = 0; i < arrtmp.size(); i++) {
            if (byPositions)
                dcdList[i] = UtilFunctions.toInt(arrtmp.get(i));
            else {
                stmp = UtilFunctions.unquote((String) arrtmp.get(i));
                dcdList[i] = colNames.get(stmp);
            }
        }
        Arrays.sort(dcdList);
    } else
        dcdList = null;
    // Scaling
    if (inputSpec.containsKey(TfUtils.TXMETHOD_SCALE)) {
        JSONArray arrtmp = (JSONArray) inputSpec.get(TfUtils.TXMETHOD_SCALE);
        scaleList = new int[arrtmp.size()];
        scaleMethods = new byte[arrtmp.size()];
        for (int i = 0; i < arrtmp.size(); i++) {
            entry = (JSONObject) arrtmp.get(i);
            if (byPositions) {
                scaleList[i] = UtilFunctions.toInt(entry.get(ID));
            } else {
                stmp = UtilFunctions.unquote((String) entry.get(NAME));
                scaleList[i] = colNames.get(stmp);
            }
            stmp = UtilFunctions.unquote((String) entry.get(METHOD));
            if (stmp.equals(SCALE_METHOD_M))
                btmp = (byte) 1;
            else if (stmp.equals(SCALE_METHOD_Z))
                btmp = (byte) 2;
            else
                throw new IOException("Unknown missing value imputation method (" + stmp + ") in transformation specification: " + specWithNames);
            scaleMethods[i] = btmp;
        }
        Integer[] idx = new Integer[scaleList.length];
        for (int i = 0; i < scaleList.length; i++) idx[i] = i;
        Arrays.sort(idx, new Comparator<Integer>() {

            @Override
            public int compare(Integer o1, Integer o2) {
                return (scaleList[o1] - scaleList[o2]);
            }
        });
        // rearrange scaleList and scaleMethods according to permutation idx
        inplacePermute(scaleList, scaleMethods, null, idx);
    } else
        scaleList = null;
    // --------------------------------------------------------------------------
    // check for column IDs that are imputed with mode, but not recoded
    // These columns have to be handled separately, because the computation of mode 
    // requires the computation of distinct values (i.e., recode maps)
    ArrayList<Integer> tmpList = new ArrayList<Integer>();
    if (mvList != null)
        for (int i = 0; i < mvList.length; i++) {
            int colID = mvList[i];
            if (mvMethods[i] == 2 && (rcdList == null || Arrays.binarySearch(rcdList, colID) < 0))
                tmpList.add(colID);
        }
    int[] mvrcdList = null;
    if (tmpList.size() > 0) {
        mvrcdList = new int[tmpList.size()];
        for (int i = 0; i < tmpList.size(); i++) mvrcdList[i] = tmpList.get(i);
    }
    if (mvList != null)
        for (int i = 0; i < mvList.length; i++) {
            int colID = mvList[i];
            if (omitList != null && Arrays.binarySearch(omitList, colID) >= 0)
                throw new IllegalArgumentException("Invalid transformations on column ID " + colID + ". A column can not be both omitted and imputed.");
            if (mvMethods[i] == 1) {
                if (rcdList != null && Arrays.binarySearch(rcdList, colID) >= 0)
                    throw new IllegalArgumentException("Invalid transformations on column ID " + colID + ". A numeric column can not be recoded.");
                if (dcdList != null && Arrays.binarySearch(dcdList, colID) >= 0)
                    // throw an error only if the column is not binned
                    if (binList == null || Arrays.binarySearch(binList, colID) < 0)
                        throw new IllegalArgumentException("Invalid transformations on column ID " + colID + ". A numeric column can not be dummycoded.");
            }
        }
    if (scaleList != null)
        for (int i = 0; i < scaleList.length; i++) {
            int colID = scaleList[i];
            if (rcdList != null && Arrays.binarySearch(rcdList, colID) >= 0)
                throw new IllegalArgumentException("Invalid transformations on column ID " + colID + ". A column can not be recoded and scaled.");
            if (binList != null && Arrays.binarySearch(binList, colID) >= 0)
                throw new IllegalArgumentException("Invalid transformations on column ID " + colID + ". A column can not be binned and scaled.");
            if (dcdList != null && Arrays.binarySearch(dcdList, colID) >= 0)
                throw new IllegalArgumentException("Invalid transformations on column ID " + colID + ". A column can not be dummycoded and scaled.");
        }
    if (rcdList != null)
        for (int i = 0; i < rcdList.length; i++) {
            int colID = rcdList[i];
            if (binList != null && Arrays.binarySearch(binList, colID) >= 0)
                throw new IllegalArgumentException("Invalid transformations on column ID " + colID + ". A column can not be recoded and binned.");
        }
    // Check if dummycoded columns are either recoded or binned.
    // If not, add them to recode list.
    ArrayList<Integer> addToRcd = new ArrayList<Integer>();
    if (dcdList != null)
        for (int i = 0; i < dcdList.length; i++) {
            int colID = dcdList[i];
            boolean isRecoded = (rcdList != null && Arrays.binarySearch(rcdList, colID) >= 0);
            boolean isBinned = (binList != null && Arrays.binarySearch(binList, colID) >= 0);
            // If colID is neither recoded nor binned, then, add it to rcdList.
            if (!isRecoded && !isBinned)
                addToRcd.add(colID);
        }
    if (addToRcd.size() > 0) {
        int[] newRcdList = null;
        if (rcdList != null)
            newRcdList = Arrays.copyOf(rcdList, rcdList.length + addToRcd.size());
        else
            newRcdList = new int[addToRcd.size()];
        int i = (rcdList != null ? rcdList.length : 0);
        for (int idx = 0; i < newRcdList.length; i++, idx++) newRcdList[i] = addToRcd.get(idx);
        Arrays.sort(newRcdList);
        rcdList = newRcdList;
    }
    // -----------------------------------------------------------------------------
    // Prepare output spec
    JSONObject outputSpec = new JSONObject();
    if (omitList != null) {
        JSONObject rcdSpec = new JSONObject();
        rcdSpec.put(TfUtils.JSON_ATTRS, toJSONArray(omitList));
        outputSpec.put(TfUtils.TXMETHOD_OMIT, rcdSpec);
    }
    if (mvList != null) {
        JSONObject mvSpec = new JSONObject();
        mvSpec.put(TfUtils.JSON_ATTRS, toJSONArray(mvList));
        mvSpec.put(TfUtils.JSON_MTHD, toJSONArray(mvMethods));
        mvSpec.put(TfUtils.JSON_CONSTS, toJSONArray(mvConstants));
        outputSpec.put(TfUtils.TXMETHOD_IMPUTE, mvSpec);
    }
    if (rcdList != null) {
        JSONObject rcdSpec = new JSONObject();
        rcdSpec.put(TfUtils.JSON_ATTRS, toJSONArray(rcdList));
        outputSpec.put(TfUtils.TXMETHOD_RECODE, rcdSpec);
    }
    if (binList != null) {
        JSONObject binSpec = new JSONObject();
        binSpec.put(TfUtils.JSON_ATTRS, toJSONArray(binList));
        binSpec.put(TfUtils.JSON_MTHD, toJSONArray(binMethods));
        binSpec.put(TfUtils.JSON_NBINS, toJSONArray(numBins));
        outputSpec.put(TfUtils.TXMETHOD_BIN, binSpec);
    }
    if (dcdList != null) {
        JSONObject dcdSpec = new JSONObject();
        dcdSpec.put(TfUtils.JSON_ATTRS, toJSONArray(dcdList));
        outputSpec.put(TfUtils.TXMETHOD_DUMMYCODE, dcdSpec);
    }
    if (scaleList != null) {
        JSONObject scaleSpec = new JSONObject();
        scaleSpec.put(TfUtils.JSON_ATTRS, toJSONArray(scaleList));
        scaleSpec.put(TfUtils.JSON_MTHD, toJSONArray(scaleMethods));
        outputSpec.put(TfUtils.TXMETHOD_SCALE, scaleSpec);
    }
    if (mvrcdList != null) {
        JSONObject mvrcd = new JSONObject();
        mvrcd.put(TfUtils.JSON_ATTRS, toJSONArray(mvrcdList));
        outputSpec.put(TfUtils.TXMETHOD_MVRCD, mvrcd);
    }
    // return output spec with IDs
    return outputSpec.toString();
}
Also used: JSONArray(org.apache.wink.json4j.JSONArray) ArrayList(java.util.ArrayList) IOException(java.io.IOException) JSONObject(org.apache.wink.json4j.JSONObject) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) FrameObject(org.apache.sysml.runtime.controlprogram.caching.FrameObject) RDDObject(org.apache.sysml.runtime.instructions.spark.data.RDDObject)
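
For orientation, here is a minimal sketch of the name-to-ID resolution that processSpecFile performs. It assumes, hypothetically, that TfUtils.TXMETHOD_RECODE and TfUtils.TXMETHOD_IMPUTE resolve to the plain strings "recode" and "impute"; the column names and IDs are illustrative.

import java.util.HashMap;
import org.apache.wink.json4j.JSONArray;
import org.apache.wink.json4j.JSONObject;

public class SpecConversionSketch {
    public static void main(String[] args) throws Exception {
        // Hypothetical name-based spec, as a user would write it.
        String specWithNames = "{\"recode\": [\"gender\"], "
            + "\"impute\": [{\"name\": \"age\", \"method\": \"global_mean\"}]}";
        // Column-name-to-ID map, as derived from the CSV header (1-based IDs).
        HashMap<String, Integer> colNames = new HashMap<String, Integer>();
        colNames.put("gender", 1);
        colNames.put("age", 2);
        JSONObject spec = new JSONObject(specWithNames);
        // processSpecFile resolves each quoted column name to its numeric ID:
        JSONArray rcd = (JSONArray) spec.get("recode");
        String col = (String) rcd.get(0);
        System.out.println(col + " -> column " + colNames.get(col));
    }
}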

Example 2 with JSONObject

Use of org.apache.wink.json4j.JSONObject in project incubator-systemml by apache.

In class DataTransform, method performTransform.

/**
	 * Main method to create and/or apply transformation metadata in memory, on a single node.
	 * 
	 * @param job job configuration
	 * @param fs file system
	 * @param inputPath path to input files
	 * @param ncols number of columns
	 * @param prop csv file format properties
	 * @param specWithIDs JSON transform specification with IDs
	 * @param tfMtdPath transform metadata path
	 * @param isApply true if existing transformation metadata should only be applied; false if metadata should first be constructed
	 * @param result output matrix
	 * @param headerLine header line
	 * @param isBB true if binary block
	 * @param isCSV true if CSV
	 * @return MR job result
	 * @throws IOException if IOException occurs
	 * @throws DMLRuntimeException if DMLRuntimeException occurs
	 * @throws IllegalArgumentException if IllegalArgumentException occurs
	 * @throws JSONException if JSONException occurs
	 */
private static JobReturn performTransform(JobConf job, FileSystem fs, String inputPath, int ncols, CSVFileFormatProperties prop, String specWithIDs, String tfMtdPath, boolean isApply, MatrixObject result, String headerLine, boolean isBB, boolean isCSV) throws IOException, DMLRuntimeException, IllegalArgumentException, JSONException {
    String[] na = TfUtils.parseNAStrings(prop.getNAStrings());
    JSONObject spec = new JSONObject(specWithIDs);
    TfUtils agents = new TfUtils(headerLine, prop.hasHeader(), prop.getDelim(), na, spec, ncols, tfMtdPath, null, null);
    MVImputeAgent _mia = agents.getMVImputeAgent();
    RecodeAgent _ra = agents.getRecodeAgent();
    BinAgent _ba = agents.getBinAgent();
    DummycodeAgent _da = agents.getDummycodeAgent();
    // List of files to read
    ArrayList<Path> files = collectInputFiles(inputPath, fs);
    // ---------------------------------
    // Construct transformation metadata
    // ---------------------------------
    String line = null;
    String[] words = null;
    int numColumnsTf = 0;
    if (!isApply) {
        for (int fileNo = 0; fileNo < files.size(); fileNo++) {
            try (BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(files.get(fileNo))))) {
                if (fileNo == 0 && prop.hasHeader())
                    //ignore header
                    br.readLine();
                line = null;
                while ((line = br.readLine()) != null) {
                    agents.prepareTfMtd(line);
                }
            }
        }
        if (agents.getValid() == 0)
            throw new DMLRuntimeException(ERROR_MSG_ZERO_ROWS);
        _mia.outputTransformationMetadata(tfMtdPath, fs, agents);
        _ba.outputTransformationMetadata(tfMtdPath, fs, agents);
        _ra.outputTransformationMetadata(tfMtdPath, fs, agents);
        // prepare agents for the subsequent phase of applying transformation metadata
        // NO need to loadTxMtd for _ra, since the maps are already present in the memory
        Path tmp = new Path(tfMtdPath);
        _mia.loadTxMtd(job, fs, tmp, agents);
        _ba.loadTxMtd(job, fs, tmp, agents);
        _da.setRecodeMapsCP(_ra.getCPRecodeMaps());
        _da.setNumBins(_ba.getColList(), _ba.getNumBins());
        _da.loadTxMtd(job, fs, tmp, agents);
    } else {
        // Count the number of rows
        int[] rows = countNumRows(files, prop, fs, agents);
        agents.setTotal(rows[0]);
        agents.setValid(rows[1]);
        if (agents.getValid() == 0)
            throw new DMLRuntimeException("Number of rows in the transformed output (potentially, after ommitting the ones with missing values) is zero. Cannot proceed.");
        // Load transformation metadata
        // prepare agents for the subsequent phase of applying transformation metadata
        Path tmp = new Path(tfMtdPath);
        _mia.loadTxMtd(job, fs, tmp, agents);
        _ra.loadTxMtd(job, fs, tmp, agents);
        _ba.loadTxMtd(job, fs, tmp, agents);
        _da.setRecodeMaps(_ra.getRecodeMaps());
        _da.setNumBins(_ba.getColList(), _ba.getNumBins());
        _da.loadTxMtd(job, fs, tmp, agents);
    }
    // -----------------------------
    // Apply transformation metadata
    // -----------------------------
    numColumnsTf = getNumColumnsTf(fs, headerLine, prop.getDelim(), tfMtdPath);
    MapReduceTool.deleteFileIfExistOnHDFS(result.getFileName());
    BufferedWriter out = new BufferedWriter(new OutputStreamWriter(fs.create(new Path(result.getFileName()), true)));
    StringBuilder sb = new StringBuilder();
    try {
        MatrixBlock mb = null;
        if (isBB) {
            int estNNZ = (int) (agents.getValid() * ncols);
            mb = new MatrixBlock((int) agents.getValid(), numColumnsTf, estNNZ);
            if (mb.isInSparseFormat())
                mb.allocateSparseRowsBlock();
            else
                mb.allocateDenseBlock();
        }
        // rowid to be used in filling the matrix block
        int rowID = 0;
        for (int fileNo = 0; fileNo < files.size(); fileNo++) {
            try (BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(files.get(fileNo))))) {
                if (fileNo == 0) {
                    if (prop.hasHeader())
                        // ignore the header line from data file
                        br.readLine();
                    //TODO: fix hard-wired header propagation to meta data column names
                    String dcdHeader = _da.constructDummycodedHeader(headerLine, agents.getDelim());
                    numColumnsTf = _da.genDcdMapsAndColTypes(fs, tfMtdPath, ncols, agents);
                    generateHeaderFiles(fs, tfMtdPath, headerLine, dcdHeader);
                }
                line = null;
                while ((line = br.readLine()) != null) {
                    words = agents.getWords(line);
                    if (!agents.omit(words)) {
                        words = agents.apply(words);
                        if (isCSV) {
                            out.write(agents.checkAndPrepOutputString(words, sb));
                            out.write("\n");
                        }
                        if (isBB) {
                            agents.check(words);
                            for (int c = 0; c < words.length; c++) {
                                if (words[c] != null && !words[c].isEmpty())
                                    mb.appendValue(rowID, c, UtilFunctions.parseToDouble(words[c]));
                            }
                        }
                        rowID++;
                    }
                }
            }
        }
        if (mb != null) {
            mb.recomputeNonZeros();
            mb.examSparsity();
            result.acquireModify(mb);
            result.release();
            result.exportData();
        }
    } finally {
        IOUtilFunctions.closeSilently(out);
    }
    MatrixCharacteristics mc = new MatrixCharacteristics(agents.getValid(), numColumnsTf, (int) result.getNumRowsPerBlock(), (int) result.getNumColumnsPerBlock());
    JobReturn ret = new JobReturn(new MatrixCharacteristics[] { mc }, true);
    return ret;
}
Also used: Path(org.apache.hadoop.fs.Path) MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) InputStreamReader(java.io.InputStreamReader) JobReturn(org.apache.sysml.runtime.matrix.JobReturn) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) BufferedWriter(java.io.BufferedWriter) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) JSONObject(org.apache.wink.json4j.JSONObject) BufferedReader(java.io.BufferedReader) OutputStreamWriter(java.io.OutputStreamWriter)
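
A minimal, self-contained sketch of the per-file read pattern used above, assuming a configured Hadoop FileSystem handle; the header handling mirrors what performTransform does for the first input file.

import java.io.BufferedReader;
import java.io.InputStreamReader;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class HdfsLineReadSketch {
    // Open the file through the Hadoop FileSystem, optionally skip a header
    // line, then hand each data line to per-line processing.
    public static void readLines(FileSystem fs, Path file, boolean hasHeader) throws Exception {
        try (BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(file)))) {
            if (hasHeader)
                br.readLine(); // ignore the header, as done for the first file above
            String line = null;
            while ((line = br.readLine()) != null) {
                // per-line work goes here (prepareTfMtd / apply in the original)
                System.out.println(line);
            }
        }
    }
}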

Example 3 with JSONObject

Use of org.apache.wink.json4j.JSONObject in project incubator-systemml by apache.

In class MVImputeAgent, method parseMethodsAndReplacments.

private void parseMethodsAndReplacments(JSONObject parsedSpec) throws JSONException {
    JSONArray mvspec = (JSONArray) parsedSpec.get(TfUtils.TXMETHOD_IMPUTE);
    _mvMethodList = new MVMethod[mvspec.size()];
    _replacementList = new String[mvspec.size()];
    _meanList = new KahanObject[mvspec.size()];
    _countList = new long[mvspec.size()];
    for (int i = 0; i < mvspec.size(); i++) {
        JSONObject mvobj = (JSONObject) mvspec.get(i);
        _mvMethodList[i] = MVMethod.valueOf(mvobj.get("method").toString().toUpperCase());
        if (_mvMethodList[i] == MVMethod.CONSTANT) {
            _replacementList[i] = mvobj.getString("value");
        }
        _meanList[i] = new KahanObject(0, 0);
    }
}
Also used: JSONObject(org.apache.wink.json4j.JSONObject) JSONArray(org.apache.wink.json4j.JSONArray) KahanObject(org.apache.sysml.runtime.instructions.cp.KahanObject)
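
The method above dispatches on the upper-cased "method" string via Enum.valueOf. A self-contained sketch of that pattern follows, with a stand-in enum whose constants mirror the method names seen in the specs; the real MVMethod definition may differ.

import org.apache.wink.json4j.JSONObject;

public class MvMethodSketch {
    // Stand-in for MVImputeAgent.MVMethod; constants assumed from the spec
    // strings "global_mean", "global_mode", and "constant".
    enum MVMethod { GLOBAL_MEAN, GLOBAL_MODE, CONSTANT }

    public static void main(String[] args) throws Exception {
        JSONObject mvobj = new JSONObject("{\"method\": \"constant\", \"value\": \"0\"}");
        MVMethod m = MVMethod.valueOf(mvobj.get("method").toString().toUpperCase());
        // Only the CONSTANT method carries a replacement value.
        String replacement = (m == MVMethod.CONSTANT) ? mvobj.getString("value") : null;
        System.out.println(m + " -> " + replacement);
    }
}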

Example 4 with JSONObject

Use of org.apache.wink.json4j.JSONObject in project incubator-systemml by apache.

In class DecoderFactory, method createDecoder.

@SuppressWarnings("unchecked")
public static Decoder createDecoder(String spec, String[] colnames, ValueType[] schema, FrameBlock meta) throws DMLRuntimeException {
    Decoder decoder = null;
    try {
        //parse transform specification
        JSONObject jSpec = new JSONObject(spec);
        List<Decoder> ldecoders = new ArrayList<Decoder>();
        //create decoders 'recode', 'dummy' and 'pass-through'
        List<Integer> rcIDs = Arrays.asList(ArrayUtils.toObject(TfMetaUtils.parseJsonIDList(jSpec, colnames, TfUtils.TXMETHOD_RECODE)));
        List<Integer> dcIDs = Arrays.asList(ArrayUtils.toObject(TfMetaUtils.parseJsonIDList(jSpec, colnames, TfUtils.TXMETHOD_DUMMYCODE)));
        rcIDs = new ArrayList<Integer>(CollectionUtils.union(rcIDs, dcIDs));
        List<Integer> ptIDs = new ArrayList<Integer>(CollectionUtils.subtract(UtilFunctions.getSequenceList(1, meta.getNumColumns(), 1), rcIDs));
        //create default schema if unspecified (with double columns for pass-through)
        if (schema == null) {
            schema = UtilFunctions.nCopies(meta.getNumColumns(), ValueType.STRING);
            for (Integer col : ptIDs) schema[col - 1] = ValueType.DOUBLE;
        }
        if (!dcIDs.isEmpty()) {
            ldecoders.add(new DecoderDummycode(schema, ArrayUtils.toPrimitive(dcIDs.toArray(new Integer[0]))));
        }
        if (!rcIDs.isEmpty()) {
            ldecoders.add(new DecoderRecode(schema, !dcIDs.isEmpty(), ArrayUtils.toPrimitive(rcIDs.toArray(new Integer[0]))));
        }
        if (!ptIDs.isEmpty()) {
            ldecoders.add(new DecoderPassThrough(schema, ArrayUtils.toPrimitive(ptIDs.toArray(new Integer[0])), ArrayUtils.toPrimitive(dcIDs.toArray(new Integer[0]))));
        }
        //create composite decoder of all created decoders
        //and initialize with given meta data (recode, dummy, bin)
        decoder = new DecoderComposite(schema, ldecoders);
        if (meta != null)
            decoder.initMetaData(meta);
    } catch (Exception ex) {
        throw new DMLRuntimeException(ex);
    }
    return decoder;
}
Also used: ArrayList(java.util.ArrayList) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) JSONObject(org.apache.wink.json4j.JSONObject)
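
A hedged usage sketch for createDecoder: the spec string and column names are illustrative, the import paths are assumptions based on the classes referenced above, and meta must be non-null because the factory derives pass-through columns from meta.getNumColumns().

import org.apache.sysml.runtime.matrix.data.FrameBlock;
import org.apache.sysml.runtime.transform.decode.Decoder;
import org.apache.sysml.runtime.transform.decode.DecoderFactory;

public class DecoderSketch {
    public static Decoder build(FrameBlock meta) throws Exception {
        // Hypothetical name-based spec: recode and dummycode the same column.
        String spec = "{\"recode\": [\"gender\"], \"dummycode\": [\"gender\"]}";
        String[] colnames = { "gender", "age" };
        // schema == null: the factory builds a default schema with double
        // columns for pass-through, as shown in the method above.
        return DecoderFactory.createDecoder(spec, colnames, null, meta);
    }
}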

Example 5 with JSONObject

Use of org.apache.wink.json4j.JSONObject in project incubator-systemml by apache.

In class ApplyTfCSVSPARK, method runSparkJob.

/**
	 * Apply transformation metadata and generate the result in CSV format, as a
	 * JavaPairRDD of Strings.
	 * 
	 * @param sec spark execution context
	 * @param inputRDD input rdd
	 * @param tfMtdPath transform metadata path
	 * @param spec transform specification as json string
	 * @param tmpPath temporary file path
	 * @param prop csv file format properties
	 * @param numCols number of columns
	 * @param headerLine header line
	 * @return JavaPairRDD of (Long, String) pairs
	 * @throws IOException if IOException occurs
	 * @throws ClassNotFoundException if ClassNotFoundException occurs
	 * @throws InterruptedException if InterruptedException occurs
	 * @throws IllegalArgumentException if IllegalArgumentException occurs
	 * @throws JSONException if JSONException occurs
	 */
public static JavaPairRDD<Long, String> runSparkJob(SparkExecutionContext sec, JavaRDD<Tuple2<LongWritable, Text>> inputRDD, String tfMtdPath, String spec, String tmpPath, CSVFileFormatProperties prop, int numCols, String headerLine) throws IOException, ClassNotFoundException, InterruptedException, IllegalArgumentException, JSONException {
    // Load transformation metadata and broadcast it
    String[] naStrings = TfUtils.parseNAStrings(prop.getNAStrings());
    JSONObject jspec = new JSONObject(spec);
    TfUtils _tfmapper = new TfUtils(headerLine, prop.hasHeader(), prop.getDelim(), naStrings, jspec, numCols, tfMtdPath, null, tmpPath);
    _tfmapper.loadTfMetadata();
    Broadcast<TfUtils> bcast_tf = sec.getSparkContext().broadcast(_tfmapper);
    /*
		 * Construct transformation metadata (map-side) -- the logic is similar
		 * to GTFMTDMapper
		 * 
		 * Note: The result of mapPartitionsWithIndex is cached so that the
		 * transformed data is not redundantly computed multiple times
		 */
    JavaPairRDD<Long, String> applyRDD = inputRDD.mapPartitionsWithIndex(new ApplyTfCSVMap(bcast_tf), true).mapToPair(new PairFunction<String, Long, String>() {

        private static final long serialVersionUID = 3868143093999082931L;

        @Override
        public Tuple2<Long, String> call(String t) throws Exception {
            return new Tuple2<Long, String>(Long.valueOf(1L), t);
        }
    }).cache();
    /*
		 * An action to force execution of apply()
		 * 
		 * We need to trigger the execution of this RDD so as to ensure the
		 * creation of a few metadata files (headers, dummycoded information,
		 * etc.), which are referenced in the caller function.
		 */
    applyRDD.count();
    return applyRDD;
}
Also used: JSONObject(org.apache.wink.json4j.JSONObject) Tuple2(scala.Tuple2) PairFunction(org.apache.spark.api.java.function.PairFunction)
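
A minimal, self-contained sketch of the cache-then-count pattern used above: cache() ensures the map work runs only once even though count() is invoked purely to force execution before the caller consumes the RDD. The input data and the constant key are illustrative.

import java.util.Arrays;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class CacheCountSketch {
    public static void main(String[] args) {
        try (JavaSparkContext sc = new JavaSparkContext("local[*]", "sketch")) {
            JavaRDD<String> lines = sc.parallelize(Arrays.asList("a,b", "c,d"));
            JavaPairRDD<Long, String> pairs = lines
                // constant key, mirroring the PairFunction in runSparkJob
                .mapToPair(t -> new Tuple2<Long, String>(Long.valueOf(1L), t))
                .cache();
            pairs.count(); // action forces execution once; results stay cached
            System.out.println(pairs.collect());
        }
    }
}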

Aggregations

JSONObject (org.apache.wink.json4j.JSONObject) 23
DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException) 14
IOException (java.io.IOException) 12
Path (org.apache.hadoop.fs.Path) 8
JSONArray (org.apache.wink.json4j.JSONArray) 7
BufferedReader (java.io.BufferedReader) 5
InputStreamReader (java.io.InputStreamReader) 5
FileSystem (org.apache.hadoop.fs.FileSystem) 5
DMLException (org.apache.sysml.api.DMLException) 4
DataExpression (org.apache.sysml.parser.DataExpression) 4
ParseException (org.apache.sysml.parser.ParseException) 4
BufferedWriter (java.io.BufferedWriter) 3
OutputStreamWriter (java.io.OutputStreamWriter) 3
ArrayList (java.util.ArrayList) 3
Entry (java.util.Map.Entry) 2
FrameObject (org.apache.sysml.runtime.controlprogram.caching.FrameObject) 2
MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject) 2
RDDObject (org.apache.sysml.runtime.instructions.spark.data.RDDObject) 2
MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics) 2
InputInfo (org.apache.sysml.runtime.matrix.data.InputInfo) 2