use of org.apache.wink.json4j.JSONArray in project incubator-systemml by apache.
the class DataTransform method processSpecFile.
/**
 * Convert input transformation specification file with column names into a
 * specification with corresponding column Ids. This file is sent to all the
 * relevant MR jobs.
 * <p>
 * Every per-transformation column list is sorted by ascending column ID, the
 * requested combination of transformations is validated, and dummycoded
 * columns that are neither recoded nor binned are added to the recode list.
 *
 * @param fs file system (not used by this method)
 * @param inputPath input file path (not used by this method)
 * @param smallestFile file name (not used by this method)
 * @param colNames map from column name to column ID; consulted only when the
 *        specification does not set {@code "ids"} to true
 * @param prop csv file format properties (not used by this method)
 * @param specWithNames transformation specification as a JSON string
 * @return the ID-based specification, serialized as a JSON string
 * @throws IllegalArgumentException if a named column is unknown, a bin count
 *         is not greater than 1, or conflicting transformations are requested
 * @throws IOException if an unknown or unsupported imputation, binning or
 *         scaling method is specified
 * @throws JSONException if JSONException occurs
 */
private static String processSpecFile(FileSystem fs, String inputPath, String smallestFile, HashMap<String, Integer> colNames, CSVFileFormatProperties prop, String specWithNames) throws IllegalArgumentException, IOException, JSONException {
    JSONObject inputSpec = new JSONObject(specWithNames);
    final String NAME = "name";
    final String ID = "id";
    final String METHOD = "method";
    final String VALUE = "value";
    final String MV_METHOD_MEAN = "global_mean";
    final String MV_METHOD_MODE = "global_mode";
    final String MV_METHOD_CONSTANT = "constant";
    final String BIN_METHOD_WIDTH = "equi-width";
    final String BIN_METHOD_HEIGHT = "equi-height";
    final String SCALE_METHOD_Z = "z-score";
    final String SCALE_METHOD_M = "mean-subtraction";
    final String JSON_BYPOS = "ids";

    // When "ids" is true, columns are referenced by ID instead of by name.
    boolean byPositions = inputSpec.containsKey(JSON_BYPOS) && ((Boolean) inputSpec.get(JSON_BYPOS)).booleanValue();

    // Omit
    int[] omitList = parseSortedColumnIds(inputSpec, TfUtils.TXMETHOD_OMIT, byPositions, colNames);

    // Missing value imputation (method codes: 1 = global_mean, 2 = global_mode, 3 = constant)
    int[] mvList = null;
    byte[] mvMethods = null;
    Object[] mvConstants = null;
    if (inputSpec.containsKey(TfUtils.TXMETHOD_IMPUTE)) {
        JSONArray arrtmp = (JSONArray) inputSpec.get(TfUtils.TXMETHOD_IMPUTE);
        mvList = new int[arrtmp.size()];
        mvMethods = new byte[arrtmp.size()];
        mvConstants = new Object[arrtmp.size()];
        for (int i = 0; i < arrtmp.size(); i++) {
            JSONObject entry = (JSONObject) arrtmp.get(i);
            if (byPositions)
                mvList[i] = UtilFunctions.toInt(entry.get(ID));
            else
                mvList[i] = resolveColumnId(colNames, UtilFunctions.unquote((String) entry.get(NAME)));
            String method = UtilFunctions.unquote((String) entry.get(METHOD));
            if (method.equals(MV_METHOD_MEAN))
                mvMethods[i] = (byte) 1;
            else if (method.equals(MV_METHOD_MODE))
                mvMethods[i] = (byte) 2;
            else if (method.equals(MV_METHOD_CONSTANT))
                mvMethods[i] = (byte) 3;
            else
                throw new IOException("Unknown missing value imputation method (" + method + ") in transformation specification: " + specWithNames);
            // the replacement value is optional; it stays null when absent
            mvConstants[i] = entry.containsKey(VALUE) ? entry.get(VALUE) : null;
        }
        // rearrange mvList, mvMethods, and mvConstants by ascending column ID
        inplacePermute(mvList, mvMethods, mvConstants, sortedPermutation(mvList));
    }

    // Recoding
    int[] rcdList = parseSortedColumnIds(inputSpec, TfUtils.TXMETHOD_RECODE, byPositions, colNames);

    // Binning (method code: 1 = equi-width)
    int[] binList = null;
    byte[] binMethods = null;
    Object[] numBins = null;
    if (inputSpec.containsKey(TfUtils.TXMETHOD_BIN)) {
        JSONArray arrtmp = (JSONArray) inputSpec.get(TfUtils.TXMETHOD_BIN);
        binList = new int[arrtmp.size()];
        binMethods = new byte[arrtmp.size()];
        numBins = new Object[arrtmp.size()];
        for (int i = 0; i < arrtmp.size(); i++) {
            JSONObject entry = (JSONObject) arrtmp.get(i);
            if (byPositions)
                binList[i] = UtilFunctions.toInt(entry.get(ID));
            else
                binList[i] = resolveColumnId(colNames, UtilFunctions.unquote((String) entry.get(NAME)));
            String method = UtilFunctions.unquote((String) entry.get(METHOD));
            if (method.equals(BIN_METHOD_WIDTH))
                binMethods[i] = (byte) 1;
            else if (method.equals(BIN_METHOD_HEIGHT))
                throw new IOException("Equi-height binning method is not yet supported, in transformation specification: " + specWithNames);
            else
                throw new IOException("Unknown binning method (" + method + ") in transformation specification: " + specWithNames);
            numBins[i] = entry.get(TfUtils.JSON_NBINS);
            if (((Integer) numBins[i]).intValue() <= 1) {
                // In positional mode the entry is addressed by its id, so report
                // the id instead of reading the name key.
                String column = byPositions ? ("ID " + binList[i]) : ("\"" + (String) entry.get(NAME) + "\"");
                throw new IllegalArgumentException("Invalid transformation on column " + column + ". Number of bins must be greater than 1.");
            }
        }
        // rearrange binList, binMethods, and numBins by ascending column ID
        inplacePermute(binList, binMethods, numBins, sortedPermutation(binList));
    }

    // Dummycoding
    int[] dcdList = parseSortedColumnIds(inputSpec, TfUtils.TXMETHOD_DUMMYCODE, byPositions, colNames);

    // Scaling (method codes: 1 = mean-subtraction, 2 = z-score)
    int[] scaleList = null;
    byte[] scaleMethods = null;
    if (inputSpec.containsKey(TfUtils.TXMETHOD_SCALE)) {
        JSONArray arrtmp = (JSONArray) inputSpec.get(TfUtils.TXMETHOD_SCALE);
        scaleList = new int[arrtmp.size()];
        scaleMethods = new byte[arrtmp.size()];
        for (int i = 0; i < arrtmp.size(); i++) {
            JSONObject entry = (JSONObject) arrtmp.get(i);
            if (byPositions)
                scaleList[i] = UtilFunctions.toInt(entry.get(ID));
            else
                scaleList[i] = resolveColumnId(colNames, UtilFunctions.unquote((String) entry.get(NAME)));
            String method = UtilFunctions.unquote((String) entry.get(METHOD));
            if (method.equals(SCALE_METHOD_M))
                scaleMethods[i] = (byte) 1;
            else if (method.equals(SCALE_METHOD_Z))
                scaleMethods[i] = (byte) 2;
            else
                throw new IOException("Unknown scaling method (" + method + ") in transformation specification: " + specWithNames);
        }
        // rearrange scaleList and scaleMethods by ascending column ID
        inplacePermute(scaleList, scaleMethods, null, sortedPermutation(scaleList));
    }

    // --------------------------------------------------------------------------
    // check for column IDs that are imputed with mode, but not recoded
    // These columns have to be handled separately, because the computation of mode
    // requires the computation of distinct values (i.e., recode maps)
    ArrayList<Integer> tmpList = new ArrayList<Integer>();
    if (mvList != null) {
        for (int i = 0; i < mvList.length; i++) {
            int colID = mvList[i];
            if (mvMethods[i] == 2 && (rcdList == null || Arrays.binarySearch(rcdList, colID) < 0))
                tmpList.add(colID);
        }
    }
    int[] mvrcdList = null;
    if (!tmpList.isEmpty()) {
        mvrcdList = new int[tmpList.size()];
        for (int i = 0; i < tmpList.size(); i++)
            mvrcdList[i] = tmpList.get(i);
    }

    // Validate combinations involving imputed columns.
    if (mvList != null) {
        for (int i = 0; i < mvList.length; i++) {
            int colID = mvList[i];
            if (omitList != null && Arrays.binarySearch(omitList, colID) >= 0)
                throw new IllegalArgumentException("Invalid transformations on column ID " + colID + ". A column can not be both omitted and imputed.");
            if (mvMethods[i] == 1) {
                // mean imputation marks the column as numeric
                if (rcdList != null && Arrays.binarySearch(rcdList, colID) >= 0)
                    throw new IllegalArgumentException("Invalid transformations on column ID " + colID + ". A numeric column can not be recoded.");
                if (dcdList != null && Arrays.binarySearch(dcdList, colID) >= 0) {
                    // throw an error only if the column is not binned
                    if (binList == null || Arrays.binarySearch(binList, colID) < 0)
                        throw new IllegalArgumentException("Invalid transformations on column ID " + colID + ". A numeric column can not be dummycoded.");
                }
            }
        }
    }

    // Scaled columns must not be recoded, binned or dummycoded.
    if (scaleList != null) {
        for (int i = 0; i < scaleList.length; i++) {
            int colID = scaleList[i];
            if (rcdList != null && Arrays.binarySearch(rcdList, colID) >= 0)
                throw new IllegalArgumentException("Invalid transformations on column ID " + colID + ". A column can not be recoded and scaled.");
            if (binList != null && Arrays.binarySearch(binList, colID) >= 0)
                throw new IllegalArgumentException("Invalid transformations on column ID " + colID + ". A column can not be binned and scaled.");
            if (dcdList != null && Arrays.binarySearch(dcdList, colID) >= 0)
                throw new IllegalArgumentException("Invalid transformations on column ID " + colID + ". A column can not be dummycoded and scaled.");
        }
    }

    // Recoded columns must not be binned.
    if (rcdList != null) {
        for (int i = 0; i < rcdList.length; i++) {
            int colID = rcdList[i];
            if (binList != null && Arrays.binarySearch(binList, colID) >= 0)
                throw new IllegalArgumentException("Invalid transformations on column ID " + colID + ". A column can not be recoded and binned.");
        }
    }

    // Check if dummycoded columns are either recoded or binned.
    // If not, add them to recode list.
    ArrayList<Integer> addToRcd = new ArrayList<Integer>();
    if (dcdList != null) {
        for (int i = 0; i < dcdList.length; i++) {
            int colID = dcdList[i];
            boolean isRecoded = (rcdList != null && Arrays.binarySearch(rcdList, colID) >= 0);
            boolean isBinned = (binList != null && Arrays.binarySearch(binList, colID) >= 0);
            if (!isRecoded && !isBinned)
                addToRcd.add(colID);
        }
    }
    if (!addToRcd.isEmpty()) {
        int offset = (rcdList != null) ? rcdList.length : 0;
        int[] newRcdList = (rcdList != null) ? Arrays.copyOf(rcdList, offset + addToRcd.size()) : new int[addToRcd.size()];
        for (int k = 0; k < addToRcd.size(); k++)
            newRcdList[offset + k] = addToRcd.get(k);
        // keep the recode list sorted
        Arrays.sort(newRcdList);
        rcdList = newRcdList;
    }

    // -----------------------------------------------------------------------------
    // Prepare output spec
    JSONObject outputSpec = new JSONObject();
    if (omitList != null) {
        JSONObject omitSpec = new JSONObject();
        omitSpec.put(TfUtils.JSON_ATTRS, toJSONArray(omitList));
        outputSpec.put(TfUtils.TXMETHOD_OMIT, omitSpec);
    }
    if (mvList != null) {
        JSONObject mvSpec = new JSONObject();
        mvSpec.put(TfUtils.JSON_ATTRS, toJSONArray(mvList));
        mvSpec.put(TfUtils.JSON_MTHD, toJSONArray(mvMethods));
        mvSpec.put(TfUtils.JSON_CONSTS, toJSONArray(mvConstants));
        outputSpec.put(TfUtils.TXMETHOD_IMPUTE, mvSpec);
    }
    if (rcdList != null) {
        JSONObject rcdSpec = new JSONObject();
        rcdSpec.put(TfUtils.JSON_ATTRS, toJSONArray(rcdList));
        outputSpec.put(TfUtils.TXMETHOD_RECODE, rcdSpec);
    }
    if (binList != null) {
        JSONObject binSpec = new JSONObject();
        binSpec.put(TfUtils.JSON_ATTRS, toJSONArray(binList));
        binSpec.put(TfUtils.JSON_MTHD, toJSONArray(binMethods));
        binSpec.put(TfUtils.JSON_NBINS, toJSONArray(numBins));
        outputSpec.put(TfUtils.TXMETHOD_BIN, binSpec);
    }
    if (dcdList != null) {
        JSONObject dcdSpec = new JSONObject();
        dcdSpec.put(TfUtils.JSON_ATTRS, toJSONArray(dcdList));
        outputSpec.put(TfUtils.TXMETHOD_DUMMYCODE, dcdSpec);
    }
    if (scaleList != null) {
        JSONObject scaleSpec = new JSONObject();
        scaleSpec.put(TfUtils.JSON_ATTRS, toJSONArray(scaleList));
        scaleSpec.put(TfUtils.JSON_MTHD, toJSONArray(scaleMethods));
        outputSpec.put(TfUtils.TXMETHOD_SCALE, scaleSpec);
    }
    if (mvrcdList != null) {
        JSONObject mvrcd = new JSONObject();
        mvrcd.put(TfUtils.JSON_ATTRS, toJSONArray(mvrcdList));
        outputSpec.put(TfUtils.TXMETHOD_MVRCD, mvrcd);
    }
    // return output spec with IDs
    return outputSpec.toString();
}

/**
 * Looks up the column ID registered for a column name, failing with a
 * descriptive error instead of a NullPointerException when the name is unknown.
 */
private static int resolveColumnId(HashMap<String, Integer> colNames, String name) {
    Integer id = colNames.get(name);
    if (id == null)
        throw new IllegalArgumentException("Unknown column name \"" + name + "\" in transformation specification.");
    return id.intValue();
}

/**
 * Reads the plain column list stored under {@code key} and returns it as a
 * sorted array of column IDs, or null when the spec has no such key.
 */
private static int[] parseSortedColumnIds(JSONObject spec, String key, boolean byPositions, HashMap<String, Integer> colNames) throws JSONException {
    if (!spec.containsKey(key))
        return null;
    JSONArray arr = (JSONArray) spec.get(key);
    int[] ids = new int[arr.size()];
    for (int i = 0; i < ids.length; i++) {
        if (byPositions)
            ids[i] = UtilFunctions.toInt(arr.get(i));
        else
            ids[i] = resolveColumnId(colNames, UtilFunctions.unquote((String) arr.get(i)));
    }
    Arrays.sort(ids);
    return ids;
}

/**
 * Returns the index permutation that orders {@code keys} ascending.
 */
private static Integer[] sortedPermutation(final int[] keys) {
    Integer[] idx = new Integer[keys.length];
    for (int i = 0; i < keys.length; i++)
        idx[i] = i;
    Arrays.sort(idx, new Comparator<Integer>() {
        @Override
        public int compare(Integer o1, Integer o2) {
            // Integer.compare avoids the overflow risk of subtracting the keys
            return Integer.compare(keys[o1], keys[o2]);
        }
    });
    return idx;
}
use of org.apache.wink.json4j.JSONArray in project incubator-systemml by apache.
the class MVImputeAgent method parseMethodsAndReplacments.
/**
 * Reads the imputation entries stored under {@code TfUtils.TXMETHOD_IMPUTE}
 * and (re)initializes the per-entry method, replacement, mean and count arrays.
 * A replacement value is recorded only for entries whose method is CONSTANT.
 *
 * @param parsedSpec parsed transformation specification
 * @throws JSONException if a required key cannot be read from the spec
 */
private void parseMethodsAndReplacments(JSONObject parsedSpec) throws JSONException {
    final JSONArray imputeSpec = (JSONArray) parsedSpec.get(TfUtils.TXMETHOD_IMPUTE);
    final int numEntries = imputeSpec.size();

    _mvMethodList = new MVMethod[numEntries];
    _replacementList = new String[numEntries];
    _meanList = new KahanObject[numEntries];
    _countList = new long[numEntries];

    for (int pos = 0; pos < numEntries; pos++) {
        final JSONObject entrySpec = (JSONObject) imputeSpec.get(pos);
        // the method name in the spec is matched against the enum constant in upper case
        final String methodName = entrySpec.get("method").toString().toUpperCase();
        final MVMethod method = MVMethod.valueOf(methodName);
        _mvMethodList[pos] = method;
        if (method == MVMethod.CONSTANT) {
            _replacementList[pos] = entrySpec.getString("value").toString();
        }
        _meanList[pos] = new KahanObject(0, 0);
    }
}
use of org.apache.wink.json4j.JSONArray in project incubator-systemml by apache.
the class DataExpression method parseMetaDataFileParameters.
/**
 * Applies the key/value pairs of a metadata (MTD) file to the parameters of
 * this expression.
 * <ul>
 * <li>Every key must be listed in {@code READ_VALID_MTD_PARAM_NAMES}; an
 * unknown key always raises a validation error.</li>
 * <li>If the parameter is already set to a {@code ConstIdentifier} whose string
 * form differs (ignoring case) from the MTD value, a validation error is
 * raised.</li>
 * <li>If the parameter is not set yet, the MTD value is added, converted to a
 * boolean, double or string identifier depending on the key. The description,
 * author and created entries are skipped.</li>
 * </ul>
 *
 * @param mtdFileName name of the metadata file, used in error messages
 * @param configObject parsed content of the metadata file
 * @param conditional passed through to {@code raiseValidateError} for all
 *        errors except invalid parameter names
 * @throws LanguageException if validation fails
 */
@SuppressWarnings("unchecked")
private void parseMetaDataFileParameters(String mtdFileName, JSONObject configObject, boolean conditional) throws LanguageException {
    for (Object obj : configObject.entrySet()) {
        Entry<Object, Object> e = (Entry<Object, Object>) obj;
        Object key = e.getKey();
        Object val = e.getValue();

        boolean isValidName = false;
        for (String paramName : READ_VALID_MTD_PARAM_NAMES) {
            if (paramName.equals(key)) {
                isValidName = true;
                break;
            }
        }
        if (!isValidName) {
            //wrong parameters always rejected
            raiseValidateError("MTD file " + mtdFileName + " contains invalid parameter name: " + key, false);
        }

        final String keyName = key.toString();
        // value already given in the read statement, if any; looked up once per entry
        final Object declared = getVarParam(keyName);

        // if the read method parameter is a constant, then verify value matches MTD metadata file
        if ((declared instanceof ConstIdentifier) && !declared.toString().equalsIgnoreCase(val.toString())) {
            raiseValidateError("parameter " + keyName + " has conflicting values in read statement definition and metadata. " + "Config file value: " + val.toString() + " from MTD file. Read statement value: " + declared, conditional);
            continue;
        }

        // only parameters the read statement does not specify are taken from the MTD file
        if (declared != null)
            continue;

        // purely descriptive entries are not turned into parameters
        if (keyName.equalsIgnoreCase(DESCRIPTIONPARAM) || keyName.equalsIgnoreCase(AUTHORPARAM) || keyName.equalsIgnoreCase(CREATEDPARAM))
            continue;

        StringIdentifier strId = new StringIdentifier(val.toString(), this.getFilename(), this.getBeginLine(), this.getBeginColumn(), this.getEndLine(), this.getEndColumn());

        if (keyName.equalsIgnoreCase(DELIM_HAS_HEADER_ROW) || keyName.equalsIgnoreCase(DELIM_FILL) || keyName.equalsIgnoreCase(DELIM_SPARSE)) {
            // parse these parameters as boolean values
            BooleanIdentifier boolId = null;
            if (strId.toString().equalsIgnoreCase("true")) {
                boolId = new BooleanIdentifier(true, this.getFilename(), this.getBeginLine(), this.getBeginColumn(), this.getEndLine(), this.getEndColumn());
            } else if (strId.toString().equalsIgnoreCase("false")) {
                boolId = new BooleanIdentifier(false, this.getFilename(), this.getBeginLine(), this.getBeginColumn(), this.getEndLine(), this.getEndColumn());
            } else {
                // report the key that actually carries the bad value
                raiseValidateError("Invalid value provided for '" + keyName + "' in metadata file '" + mtdFileName + "'. " + "Must be either TRUE or FALSE.", conditional);
            }
            // NOTE(review): if raiseValidateError returns instead of throwing
            // (presumably when conditional is true - confirm), boolId is still
            // null here and is registered as-is, exactly as before this change.
            removeVarParam(keyName);
            addVarParam(keyName, boolId);
        } else if (keyName.equalsIgnoreCase(DELIM_FILL_VALUE)) {
            // parse these parameters as numeric values
            DoubleIdentifier doubleId = new DoubleIdentifier(Double.parseDouble(strId.toString()), this.getFilename(), this.getBeginLine(), this.getBeginColumn(), this.getEndLine(), this.getEndColumn());
            removeVarParam(keyName);
            addVarParam(keyName, doubleId);
        } else if (keyName.equalsIgnoreCase(DELIM_NA_STRINGS)) {
            // NA strings are either a single string or an array that is joined
            // with DELIM_NA_STRING_SEP
            String naStrings = null;
            if (val instanceof String) {
                naStrings = val.toString();
            } else {
                StringBuilder sb = new StringBuilder();
                JSONArray valarr = (JSONArray) val;
                for (int naid = 0; naid < valarr.size(); naid++) {
                    sb.append((String) valarr.get(naid));
                    if (naid < valarr.size() - 1)
                        sb.append(DELIM_NA_STRING_SEP);
                }
                naStrings = sb.toString();
            }
            StringIdentifier sid = new StringIdentifier(naStrings, this.getFilename(), this.getBeginLine(), this.getBeginColumn(), this.getEndLine(), this.getEndColumn());
            removeVarParam(keyName);
            addVarParam(keyName, sid);
        } else {
            // by default, treat a parameter as a string
            addVarParam(keyName, strId);
        }
    }
}
use of org.apache.wink.json4j.JSONArray in project incubator-systemml by apache.
the class ScalingTest method generateSpecFile.
// ----------------------------
/**
 * Writes a transformation specification that scales columns V1..V{cols}:
 * columns up to {@code cols / 2} use mean-subtraction, the remaining ones
 * use z-score. An existing file at {@code specFile} is overwritten.
 *
 * @param cols number of columns to list in the specification
 * @param specFile path of the specification file to create
 * @throws IOException if the file cannot be written
 * @throws Exception if building the JSON specification fails
 */
private void generateSpecFile(int cols, String specFile) throws IOException, Exception {
    final String nameKey = "name";
    final String methodKey = "method";
    final String zScore = "z-score";
    final String meanSubtraction = "mean-subtraction";

    final int lastMeanSubtractedCol = cols / 2;
    JSONArray scaleEntries = new JSONArray();
    for (int col = 1; col <= cols; col++) {
        String method = (col > lastMeanSubtractedCol) ? zScore : meanSubtraction;
        JSONObject entry = new JSONObject();
        entry.put(nameKey, "V" + col);
        entry.put(methodKey, method);
        scaleEntries.add(entry);
    }

    JSONObject spec = new JSONObject();
    spec.put(TfUtils.TXMETHOD_SCALE, scaleEntries);

    FileSystem fileSystem = IOUtilFunctions.getFileSystem(specFile);
    Path target = new Path(specFile);
    // second argument of create: overwrite an existing file
    try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(fileSystem.create(target, true)))) {
        writer.write(spec.toString());
    }
}
use of org.apache.wink.json4j.JSONArray in project incubator-systemml by apache.
the class TfMetaUtils method parseJsonIDList.
/**
 * Extracts the column IDs listed for one transformation group of a spec.
 * <p>
 * The group value is either a plain array, or an object holding the array
 * under {@code TfUtils.JSON_ATTRS}. Entries are read as column IDs when the
 * spec sets {@code "ids"} to true or when the object form is used; otherwise
 * they are looked up in {@code colnames} and mapped to 1-based positions.
 *
 * TODO consolidate external and internal json spec definitions
 *
 * @param spec parsed transform specification
 * @param colnames column names, used to resolve entries given by name
 * @param group key of the transformation group to read
 * @return column IDs in ascending order; an empty array if the group is absent
 * @throws JSONException if JSONException occurs
 */
public static int[] parseJsonIDList(JSONObject spec, String[] colnames, String group) throws JSONException {
    boolean byId = spec.containsKey("ids") && spec.getBoolean("ids");
    if (!spec.containsKey(group)) {
        return new int[0];
    }

    //parse attribute-array or plain array of IDs
    final Object groupSpec = spec.get(group);
    final JSONArray attrs;
    if (groupSpec instanceof JSONObject) {
        attrs = (JSONArray) ((JSONObject) groupSpec).get(TfUtils.JSON_ATTRS);
        //file-based transform outputs ids w/o id tags
        byId = true;
    } else {
        attrs = (JSONArray) groupSpec;
    }

    //construct ID list array
    final int[] colList = new int[attrs.size()];
    for (int pos = 0; pos < colList.length; pos++) {
        final Object attr = attrs.get(pos);
        if (byId) {
            colList[pos] = UtilFunctions.toInt(attr);
        } else {
            // indexOf yields -1 for an unknown name, i.e. 0 after the shift to 1-based IDs
            colList[pos] = ArrayUtils.indexOf(colnames, attr) + 1;
        }
        if (colList[pos] <= 0) {
            throw new RuntimeException("Specified column '" + attr + "' does not exist.");
        }
    }

    //ensure ascending order of column IDs
    Arrays.sort(colList);
    return colList;
}
Aggregations