Use of com.google.refine.model.ReconType in project OpenRefine by OpenRefine.
From the class GuessTypesOfColumnCommand, method guessTypes:
/**
 * Run relevance searches for the first n cells in the given column and
 * count the types of the results. Return a sorted list of types, from most
 * frequent to least.
 *
 * @param project the project whose rows are sampled
 * @param column the column whose cell values are sent to the reconciliation service
 * @param serviceUrl the reconciliation service endpoint to query
 * @return type groups sorted from most to least frequent
 * @throws IOException if the reconciliation service cannot be reached
 */
protected List<TypeGroup> guessTypes(Project project, Column column, String serviceUrl) throws IOException {
    Map<String, TypeGroup> map = new HashMap<String, TypeGroup>();
    int cellIndex = column.getCellIndex();
    List<String> samples = new ArrayList<String>(sampleSize);
    Set<String> sampleSet = new HashSet<String>();
    for (Row row : project.rows) {
        Object value = row.getCellValue(cellIndex);
        if (ExpressionUtils.isNonBlankData(value)) {
            String s = value.toString().trim();
            if (!sampleSet.contains(s)) {
                samples.add(s);
                sampleSet.add(s);
                if (samples.size() >= sampleSize) {
                    break;
                }
            }
        }
    }
    Map<String, IndividualQuery> queryMap = new HashMap<>();
    for (int i = 0; i < samples.size(); i++) {
        queryMap.put("q" + i, new IndividualQuery(samples.get(i), 3));
    }
    String queriesString = ParsingUtilities.defaultWriter.writeValueAsString(queryMap);
    String responseString;
    try {
        responseString = postQueries(serviceUrl, queriesString);
        ObjectNode o = ParsingUtilities.evaluateJsonStringToObjectNode(responseString);
        Iterator<JsonNode> iterator = o.iterator();
        while (iterator.hasNext()) {
            JsonNode o2 = iterator.next();
            if (!(o2.has("result") && o2.get("result") instanceof ArrayNode)) {
                continue;
            }
            ArrayNode results = (ArrayNode) o2.get("result");
            List<ReconResult> reconResults = ParsingUtilities.mapper.convertValue(results,
                    new TypeReference<List<ReconResult>>() {
                    });
            int count = reconResults.size();
            for (int j = 0; j < count; j++) {
                ReconResult result = reconResults.get(j);
                // score by each result's rank
                double score = 1.0 / (1 + j);
                List<ReconType> types = result.types;
                int typeCount = types.size();
                for (int t = 0; t < typeCount; t++) {
                    ReconType type = types.get(t);
                    double score2 = score * (typeCount - t) / typeCount;
                    if (map.containsKey(type.id)) {
                        TypeGroup tg = map.get(type.id);
                        tg.score += score2;
                        tg.count++;
                    } else {
                        map.put(type.id, new TypeGroup(type.id, type.name, score2));
                    }
                }
            }
        }
    } catch (IOException e) {
        logger.error("Failed to guess cell types for load\n" + queriesString, e);
        throw e;
    }
    List<TypeGroup> types = new ArrayList<TypeGroup>(map.values());
    Collections.sort(types, new Comparator<TypeGroup>() {
        @Override
        public int compare(TypeGroup o1, TypeGroup o2) {
            int c = Math.min(sampleSize, o2.count) - Math.min(sampleSize, o1.count);
            if (c != 0) {
                return c;
            }
            return (int) Math.signum(o2.score / o2.count - o1.score / o1.count);
        }
    });
    return types;
}
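The scoring above weights each candidate by 1 / (1 + rank) and then splits that weight across the candidate's types, favouring types listed first. A minimal standalone sketch of that scheme, using plain collections and hypothetical type ids instead of OpenRefine's ReconResult and ReconType classes:

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class TypeScoringSketch {

    public static void main(String[] args) {
        // Hypothetical reconciliation candidates for one sampled cell:
        // each candidate carries an ordered list of type ids.
        List<List<String>> candidateTypes = new ArrayList<>();
        candidateTypes.add(List.of("Q5", "Q215627")); // top-ranked candidate
        candidateTypes.add(List.of("Q5"));            // second candidate
        candidateTypes.add(List.of("Q4167410"));      // third candidate

        Map<String, Double> scores = new HashMap<>();
        for (int j = 0; j < candidateTypes.size(); j++) {
            // a candidate's weight decays with its rank, as in guessTypes
            double score = 1.0 / (1 + j);
            List<String> types = candidateTypes.get(j);
            int typeCount = types.size();
            for (int t = 0; t < typeCount; t++) {
                // earlier types on a candidate get a larger share of its weight
                double score2 = score * (typeCount - t) / typeCount;
                scores.merge(types.get(t), score2, Double::sum);
            }
        }
        scores.forEach((id, s) -> System.out.printf("%s -> %.2f%n", id, s));
    }
}

Running it shows Q5 accumulating 1.5, ahead of Q215627 (0.5) and Q4167410 (about 0.33): the type shared by the two top-ranked candidates ends up first once the groups are sorted.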
Use of com.google.refine.model.ReconType in project OpenRefine by OpenRefine.
From the class DataExtensionChange, method save:
@Override
public void save(Writer writer, Properties options) throws IOException {
    writer.write("baseColumnName=");
    writer.write(_baseColumnName);
    writer.write('\n');
    writer.write("service=");
    writer.write(_service);
    writer.write('\n');
    writer.write("identifierSpace=");
    writer.write(_identifierSpace);
    writer.write('\n');
    writer.write("schemaSpace=");
    writer.write(_schemaSpace);
    writer.write('\n');
    writer.write("columnInsertIndex=");
    writer.write(Integer.toString(_columnInsertIndex));
    writer.write('\n');
    writer.write("columnNameCount=");
    writer.write(Integer.toString(_columnNames.size()));
    writer.write('\n');
    for (String name : _columnNames) {
        writer.write(name);
        writer.write('\n');
    }
    writer.write("columnTypeCount=");
    writer.write(Integer.toString(_columnTypes.size()));
    writer.write('\n');
    for (ReconType type : _columnTypes) {
        if (type != null) {
            ParsingUtilities.defaultWriter.writeValue(writer, type);
        }
        writer.write('\n');
    }
    writer.write("rowIndexCount=");
    writer.write(Integer.toString(_rowIndices.size()));
    writer.write('\n');
    for (Integer rowIndex : _rowIndices) {
        writer.write(rowIndex.toString());
        writer.write('\n');
    }
    writer.write("firstNewCellIndex=");
    writer.write(Integer.toString(_firstNewCellIndex));
    writer.write('\n');
    writer.write("newRowCount=");
    writer.write(Integer.toString(_newRows.size()));
    writer.write('\n');
    for (Row row : _newRows) {
        row.save(writer, options);
        writer.write('\n');
    }
    writer.write("oldRowCount=");
    writer.write(Integer.toString(_oldRows.size()));
    writer.write('\n');
    for (Row row : _oldRows) {
        row.save(writer, options);
        writer.write('\n');
    }
    // end of change marker
    writer.write("/ec/\n");
}
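save() emits one field per line in key=value form, followed by counted sections (column names, column types, row indices, new and old rows) and the /ec/ end marker. As an illustration only, not DataExtensionChange's actual load logic, here is a small sketch that reads the leading key=value header back with plain java.io; the field values are hypothetical:

import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

public class ChangeHeaderSketch {

    public static void main(String[] args) throws IOException {
        // hypothetical start of a record, shaped like the output of save() above
        String serialized = "baseColumnName=country\n"
                + "service=https://wikidata.reconci.link/en/api\n"
                + "identifierSpace=http://www.wikidata.org/entity/\n"
                + "schemaSpace=http://www.wikidata.org/prop/direct/\n"
                + "columnInsertIndex=2\n";
        BufferedReader reader = new BufferedReader(new StringReader(serialized));
        Map<String, String> fields = new HashMap<>();
        String line;
        while ((line = reader.readLine()) != null) {
            int eq = line.indexOf('=');
            // only the simple key=value lines are handled here; the counted
            // sections and row data that follow would need their own parsing
            if (eq >= 0) {
                fields.put(line.substring(0, eq), line.substring(eq + 1));
            }
        }
        System.out.println(fields.get("baseColumnName") + " extended at column index " + fields.get("columnInsertIndex"));
    }
}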
Use of com.google.refine.model.ReconType in project OpenRefine by OpenRefine.
From the class DataExtensionChange, method apply:
@Override
public void apply(Project project) {
    synchronized (project) {
        if (_firstNewCellIndex < 0) {
            _firstNewCellIndex = project.columnModel.allocateNewCellIndex();
            for (int i = 1; i < _columnNames.size(); i++) {
                project.columnModel.allocateNewCellIndex();
            }
            _oldRows = new ArrayList<Row>(project.rows);
            _newRows = new ArrayList<Row>(project.rows.size());
            int cellIndex = project.columnModel.getColumnByName(_baseColumnName).getCellIndex();
            int keyCellIndex = project.columnModel.columns.get(project.columnModel.getKeyColumnIndex()).getCellIndex();
            int index = 0;
            int rowIndex = index < _rowIndices.size() ? _rowIndices.get(index) : _oldRows.size();
            DataExtension dataExtension = index < _rowIndices.size() ? _dataExtensions.get(index) : null;
            index++;
            Map<String, Recon> reconMap = new HashMap<String, Recon>();
            for (int r = 0; r < _oldRows.size(); r++) {
                Row oldRow = _oldRows.get(r);
                if (r < rowIndex) {
                    _newRows.add(oldRow.dup());
                    continue;
                }
                if (dataExtension == null || dataExtension.data.length == 0) {
                    _newRows.add(oldRow);
                } else {
                    Row firstNewRow = oldRow.dup();
                    extendRow(firstNewRow, dataExtension, 0, reconMap);
                    _newRows.add(firstNewRow);
                    int r2 = r + 1;
                    for (int subR = 1; subR < dataExtension.data.length; subR++) {
                        if (r2 < project.rows.size()) {
                            Row oldRow2 = project.rows.get(r2);
                            if (oldRow2.isCellBlank(cellIndex) && oldRow2.isCellBlank(keyCellIndex)) {
                                Row newRow = oldRow2.dup();
                                extendRow(newRow, dataExtension, subR, reconMap);
                                _newRows.add(newRow);
                                r2++;
                                continue;
                            }
                        }
                        Row newRow = new Row(cellIndex + _columnNames.size());
                        extendRow(newRow, dataExtension, subR, reconMap);
                        _newRows.add(newRow);
                    }
                    // r will be incremented by the for loop anyway
                    r = r2 - 1;
                }
                rowIndex = index < _rowIndices.size() ? _rowIndices.get(index) : _oldRows.size();
                dataExtension = index < _rowIndices.size() ? _dataExtensions.get(index) : null;
                index++;
            }
        }
        project.rows.clear();
        project.rows.addAll(_newRows);
        for (int i = 0; i < _columnNames.size(); i++) {
            String name = _columnNames.get(i);
            int cellIndex = _firstNewCellIndex + i;
            Column column = new Column(cellIndex, name);
            ReconType columnType = _columnTypes.get(i);
            column.setReconConfig(new DataExtensionReconConfig(_service, _identifierSpace, _schemaSpace, columnType));
            ReconStats reconStats = ReconStats.create(project, cellIndex);
            if (reconStats.matchedTopics > 0) {
                column.setReconStats(reconStats);
            }
            try {
                project.columnModel.addColumn(_columnInsertIndex + i, column, true);
                // the column might have been renamed to avoid collision
                _columnNames.set(i, column.getName());
            } catch (ModelException e) {
                // won't get here since we set the avoid collision flag
            }
        }
        project.update();
    }
}
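When a data extension returns more rows than the base record occupies, apply() spills the extra values into the following rows, reusing blank dependent rows where possible and creating new blank-keyed rows otherwise. A simplified sketch of that fan-out with plain arrays and hypothetical values, standing in for Row, DataExtension and extendRow:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class RowFanOutSketch {

    public static void main(String[] args) {
        // hypothetical base rows: [record name, placeholder for the new column]
        List<String[]> rows = new ArrayList<>();
        rows.add(new String[] { "France", null });
        rows.add(new String[] { "Japan", null });

        // hypothetical extension for "France" with two fetched values,
        // standing in for a DataExtension whose data array has two rows
        String[][] franceExtension = { { "Paris" }, { "Lyon" } };

        List<String[]> newRows = new ArrayList<>();
        for (String[] row : rows) {
            if ("France".equals(row[0])) {
                // the first extension row lands on the original record...
                newRows.add(new String[] { row[0], franceExtension[0][0] });
                // ...and each further extension row becomes its own spill-over row,
                // mirroring the extra rows created in apply() above
                for (int subR = 1; subR < franceExtension.length; subR++) {
                    newRows.add(new String[] { null, franceExtension[subR][0] });
                }
            } else {
                newRows.add(row);
            }
        }
        newRows.forEach(r -> System.out.println(Arrays.toString(r)));
    }
}

The output keeps the base record on its original row and pushes the second fetched value onto a new row with a blank key, which is how extending a column can grow the project's row count.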