Search in sources :

Example 1 with ReconType

use of com.google.refine.model.ReconType in project OpenRefine by OpenRefine.

In the class GuessTypesOfColumnCommand, the method guessTypes.

/**
 * Guesses the most likely types for a column by running relevance searches on a sample of
 * its values and tallying the types of the returned candidates.
 *
 * <p>Up to {@code sampleSize} distinct, non-blank, trimmed cell values are collected from the
 * column in row order and posted as a single batch of queries via {@code postQueries}. Every
 * candidate in every response contributes to the score of each of its types:
 * a candidate at rank {@code j} (0-based) is worth {@code 1 / (1 + j)}, and that weight is
 * scaled by {@code (typeCount - t) / typeCount} for the type at position {@code t} of the
 * candidate's type list, so earlier types weigh more than later ones.
 *
 * <p>Response entries that have no {@code "result"} array are skipped, as are candidates whose
 * type list is {@code null}.
 *
 * @param project the project whose rows are sampled
 * @param column the column whose cell values are sampled
 * @param serviceUrl handed to {@code postQueries} as the destination of the query batch
 * @return the type groups that were encountered, ordered by occurrence count (capped at
 *         {@code sampleSize}) from highest to lowest, ties broken by highest average score
 * @throws IOException if posting the queries or processing the response fails; the failure is
 *         logged together with the query payload before being rethrown
 */
protected List<TypeGroup> guessTypes(Project project, Column column, String serviceUrl) throws IOException {
    Map<String, TypeGroup> map = new HashMap<String, TypeGroup>();
    int cellIndex = column.getCellIndex();

    // Gather the first sampleSize distinct non-blank values, preserving row order.
    List<String> samples = new ArrayList<String>(sampleSize);
    Set<String> sampleSet = new HashSet<String>();
    for (Row row : project.rows) {
        Object value = row.getCellValue(cellIndex);
        if (ExpressionUtils.isNonBlankData(value)) {
            String s = value.toString().trim();
            // Set.add returns false for a value that was already sampled
            if (sampleSet.add(s)) {
                samples.add(s);
                if (samples.size() >= sampleSize) {
                    break;
                }
            }
        }
    }

    // One query per sample, keyed "q0", "q1", ...
    // NOTE(review): the second IndividualQuery argument (3) is presumably a result limit - confirm.
    Map<String, IndividualQuery> queryMap = new HashMap<>();
    for (int i = 0; i < samples.size(); i++) {
        queryMap.put("q" + i, new IndividualQuery(samples.get(i), 3));
    }
    String queriesString = ParsingUtilities.defaultWriter.writeValueAsString(queryMap);
    String responseString;
    try {
        responseString = postQueries(serviceUrl, queriesString);
        ObjectNode o = ParsingUtilities.evaluateJsonStringToObjectNode(responseString);
        Iterator<JsonNode> iterator = o.iterator();
        while (iterator.hasNext()) {
            JsonNode o2 = iterator.next();
            // ignore entries that do not carry an array of results
            if (!(o2.has("result") && o2.get("result") instanceof ArrayNode)) {
                continue;
            }
            ArrayNode results = (ArrayNode) o2.get("result");
            List<ReconResult> reconResults = ParsingUtilities.mapper.convertValue(results, new TypeReference<List<ReconResult>>() {
            });
            int count = reconResults.size();
            for (int j = 0; j < count; j++) {
                ReconResult result = reconResults.get(j);
                // score by each result's rank
                double score = 1.0 / (1 + j);
                List<ReconType> types = result.types;
                if (types == null) {
                    // a candidate without a type list has nothing to contribute
                    continue;
                }
                int typeCount = types.size();
                for (int t = 0; t < typeCount; t++) {
                    ReconType type = types.get(t);
                    // earlier types in a candidate's list get a larger share of its score
                    double score2 = score * (typeCount - t) / typeCount;
                    // the map never holds null values, so a null lookup means "not seen yet"
                    TypeGroup tg = map.get(type.id);
                    if (tg != null) {
                        tg.score += score2;
                        tg.count++;
                    } else {
                        map.put(type.id, new TypeGroup(type.id, type.name, score2));
                    }
                }
            }
        }
    } catch (IOException e) {
        logger.error("Failed to guess cell types for load\n" + queriesString, e);
        throw e;
    }

    List<TypeGroup> types = new ArrayList<TypeGroup>(map.values());
    Collections.sort(types, new Comparator<TypeGroup>() {

        @Override
        public int compare(TypeGroup o1, TypeGroup o2) {
            // descending by occurrence count, capped at sampleSize
            int c = Integer.compare(Math.min(sampleSize, o2.count), Math.min(sampleSize, o1.count));
            if (c != 0) {
                return c;
            }
            // then descending by average score per occurrence
            return Double.compare(o2.score / o2.count, o1.score / o1.count);
        }
    });
    return types;
}
Also used : ReconType(com.google.refine.model.ReconType) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) JsonNode(com.fasterxml.jackson.databind.JsonNode) ArrayList(java.util.ArrayList) List(java.util.List) ArrayNode(com.fasterxml.jackson.databind.node.ArrayNode) HashSet(java.util.HashSet) ObjectNode(com.fasterxml.jackson.databind.node.ObjectNode) IOException(java.io.IOException) ReconResult(com.google.refine.model.recon.StandardReconConfig.ReconResult) Row(com.google.refine.model.Row)

Example 2 with ReconType

use of com.google.refine.model.ReconType in project OpenRefine by OpenRefine.

In the class DataExtensionChange, the method save.

/**
 * Writes this change to {@code writer} as a sequence of newline-terminated lines.
 *
 * <p>The output consists of {@code key=value} header lines, followed by counted sections
 * (a {@code ...Count=} line and then one line per element) for the column names, column
 * types, row indices, new rows and old rows, and ends with the {@code /ec/} marker line.
 * A {@code null} column type is written as an empty line.
 *
 * @param writer destination of the serialized change
 * @param options passed through unchanged to {@code Row.save}
 * @throws IOException if writing to {@code writer} fails
 */
@Override
public void save(Writer writer, Properties options) throws IOException {
    writeKeyValueLine(writer, "baseColumnName=", _baseColumnName);
    writeKeyValueLine(writer, "service=", _service);
    writeKeyValueLine(writer, "identifierSpace=", _identifierSpace);
    writeKeyValueLine(writer, "schemaSpace=", _schemaSpace);
    writeKeyValueLine(writer, "columnInsertIndex=", Integer.toString(_columnInsertIndex));

    writeSizeLine(writer, "columnNameCount=", _columnNames);
    for (String columnName : _columnNames) {
        writer.write(columnName);
        writer.write('\n');
    }

    writeSizeLine(writer, "columnTypeCount=", _columnTypes);
    for (ReconType columnType : _columnTypes) {
        // a missing type still occupies a (blank) line so the count stays accurate
        if (columnType != null) {
            ParsingUtilities.defaultWriter.writeValue(writer, columnType);
        }
        writer.write('\n');
    }

    writeSizeLine(writer, "rowIndexCount=", _rowIndices);
    for (Integer index : _rowIndices) {
        writer.write(index.toString());
        writer.write('\n');
    }

    writeKeyValueLine(writer, "firstNewCellIndex=", Integer.toString(_firstNewCellIndex));

    writeSizeLine(writer, "newRowCount=", _newRows);
    writeRowLines(writer, _newRows, options);

    writeSizeLine(writer, "oldRowCount=", _oldRows);
    writeRowLines(writer, _oldRows, options);

    // end of change marker
    writer.write("/ec/\n");
}

/** Writes {@code key}, then {@code value}, then a newline, as three separate writes. */
private static void writeKeyValueLine(Writer writer, String key, String value) throws IOException {
    writer.write(key);
    writer.write(value);
    writer.write('\n');
}

/** Writes {@code key}, then the size of {@code items} in decimal, then a newline. */
private static void writeSizeLine(Writer writer, String key, java.util.Collection<?> items) throws IOException {
    writer.write(key);
    writer.write(Integer.toString(items.size()));
    writer.write('\n');
}

/** Saves each row through {@code Row.save} and terminates it with a newline. */
private static void writeRowLines(Writer writer, Iterable<Row> rows, Properties options) throws IOException {
    for (Row current : rows) {
        current.save(writer, options);
        writer.write('\n');
    }
}
Also used : ReconType(com.google.refine.model.ReconType) Row(com.google.refine.model.Row)

Example 3 with ReconType

use of com.google.refine.model.ReconType in project OpenRefine by OpenRefine.

In the class DataExtensionChange, the method apply.

/**
 * Applies this data-extension change to {@code project} while holding the project's monitor.
 *
 * <p>On the first application (detected by {@code _firstNewCellIndex < 0}) the method allocates
 * one new cell index per entry of {@code _columnNames}, snapshots the current rows into
 * {@code _oldRows} and builds {@code _newRows} by merging the entries of
 * {@code _dataExtensions} into the rows listed in {@code _rowIndices}. On later applications
 * that work is skipped and the previously built {@code _newRows} are reused.
 *
 * <p>In both cases the project's rows are then replaced by {@code _newRows}, one column per
 * entry of {@code _columnNames} is added starting at {@code _columnInsertIndex}, and
 * {@code project.update()} is called.
 *
 * @param project the project to modify
 */
@Override
public void apply(Project project) {
    synchronized (project) {
        if (_firstNewCellIndex < 0) {
            _firstNewCellIndex = project.columnModel.allocateNewCellIndex();
            // starts at 1: the first cell index was allocated on the line above
            for (int i = 1; i < _columnNames.size(); i++) {
                project.columnModel.allocateNewCellIndex();
            }
            _oldRows = new ArrayList<Row>(project.rows);
            _newRows = new ArrayList<Row>(project.rows.size());
            int cellIndex = project.columnModel.getColumnByName(_baseColumnName).getCellIndex();
            int keyCellIndex = project.columnModel.columns.get(project.columnModel.getKeyColumnIndex()).getCellIndex();
            // "index" is a cursor that walks _rowIndices and _dataExtensions in parallel;
            // rowIndex / dataExtension always describe the next row that has extension data.
            // Once the cursor runs past the end, rowIndex becomes _oldRows.size() so that every
            // remaining row falls into the "r < rowIndex" branch below.
            int index = 0;
            int rowIndex = index < _rowIndices.size() ? _rowIndices.get(index) : _oldRows.size();
            DataExtension dataExtension = index < _rowIndices.size() ? _dataExtensions.get(index) : null;
            index++;
            Map<String, Recon> reconMap = new HashMap<String, Recon>();
            for (int r = 0; r < _oldRows.size(); r++) {
                Row oldRow = _oldRows.get(r);
                if (r < rowIndex) {
                    // not yet at the next row with extension data: carry over a copy
                    _newRows.add(oldRow.dup());
                    continue;
                }
                if (dataExtension == null || dataExtension.data.length == 0) {
                    // nothing to add for this row: the original row object is reused as is
                    _newRows.add(oldRow);
                } else {
                    // the first data row of the extension goes into a copy of the current row
                    Row firstNewRow = oldRow.dup();
                    extendRow(firstNewRow, dataExtension, 0, reconMap);
                    _newRows.add(firstNewRow);
                    // r2 points at the next existing row that could absorb further data rows
                    int r2 = r + 1;
                    for (int subR = 1; subR < dataExtension.data.length; subR++) {
                        if (r2 < project.rows.size()) {
                            Row oldRow2 = project.rows.get(r2);
                            // reuse the following row only when both its base-column cell and
                            // its key-column cell are blank
                            if (oldRow2.isCellBlank(cellIndex) && oldRow2.isCellBlank(keyCellIndex)) {
                                Row newRow = oldRow2.dup();
                                extendRow(newRow, dataExtension, subR, reconMap);
                                _newRows.add(newRow);
                                r2++;
                                continue;
                            }
                        }
                        // otherwise insert a brand-new row for this data row
                        // NOTE(review): the Row constructor argument is presumably an initial
                        // cell capacity - confirm against Row
                        Row newRow = new Row(cellIndex + _columnNames.size());
                        extendRow(newRow, dataExtension, subR, reconMap);
                        _newRows.add(newRow);
                    }
                    // skip the existing rows that were consumed above;
                    // r will be incremented by the for loop anyway
                    r = r2 - 1;
                }
                // advance the cursor to the next row that has extension data
                rowIndex = index < _rowIndices.size() ? _rowIndices.get(index) : _oldRows.size();
                dataExtension = index < _rowIndices.size() ? _dataExtensions.get(index) : null;
                index++;
            }
        }
        // swap the project's rows for the extended ones
        project.rows.clear();
        project.rows.addAll(_newRows);
        // create one column per new column name, using consecutive cell indices
        for (int i = 0; i < _columnNames.size(); i++) {
            String name = _columnNames.get(i);
            int cellIndex = _firstNewCellIndex + i;
            Column column = new Column(cellIndex, name);
            ReconType columnType = _columnTypes.get(i);
            column.setReconConfig(new DataExtensionReconConfig(_service, _identifierSpace, _schemaSpace, columnType));
            ReconStats reconStats = ReconStats.create(project, cellIndex);
            // stats are attached only when at least one matched topic was counted
            if (reconStats.matchedTopics > 0) {
                column.setReconStats(reconStats);
            }
            try {
                project.columnModel.addColumn(_columnInsertIndex + i, column, true);
                // the column might have been renamed to avoid collision
                _columnNames.set(i, column.getName());
            } catch (ModelException e) {
            // won't get here since we set the avoid collision flag
            }
        }
        project.update();
    }
}
Also used : ReconType(com.google.refine.model.ReconType) DataExtensionReconConfig(com.google.refine.model.recon.DataExtensionReconConfig) ModelException(com.google.refine.model.ModelException) HashMap(java.util.HashMap) DataExtension(com.google.refine.model.recon.ReconciledDataExtensionJob.DataExtension) Column(com.google.refine.model.Column) ReconStats(com.google.refine.model.ReconStats) Row(com.google.refine.model.Row) Recon(com.google.refine.model.Recon)

Aggregations

ReconType (com.google.refine.model.ReconType)3 Row (com.google.refine.model.Row)3 HashMap (java.util.HashMap)2 JsonNode (com.fasterxml.jackson.databind.JsonNode)1 ArrayNode (com.fasterxml.jackson.databind.node.ArrayNode)1 ObjectNode (com.fasterxml.jackson.databind.node.ObjectNode)1 Column (com.google.refine.model.Column)1 ModelException (com.google.refine.model.ModelException)1 Recon (com.google.refine.model.Recon)1 ReconStats (com.google.refine.model.ReconStats)1 DataExtensionReconConfig (com.google.refine.model.recon.DataExtensionReconConfig)1 DataExtension (com.google.refine.model.recon.ReconciledDataExtensionJob.DataExtension)1 ReconResult (com.google.refine.model.recon.StandardReconConfig.ReconResult)1 IOException (java.io.IOException)1 ArrayList (java.util.ArrayList)1 HashSet (java.util.HashSet)1 List (java.util.List)1