Search in sources :

Example 31 with Cell

use of com.google.refine.model.Cell in project OpenRefine by OpenRefine.

the class ExcelImporter method parseOneFile.

/**
 * Parses an Excel workbook from {@code inputStream} and imports every sheet whose index is
 * listed under the {@code "sheets"} option.
 *
 * <p>Failures while opening the workbook are recorded in {@code exceptions} (wrapped in an
 * {@link ImportException}) and abort the import; nothing is thrown to the caller for them.
 * Each selected sheet is exposed as a {@link TableDataReader} and handed to
 * {@link TabularImportingParserBase#readTable}, with {@code fileSource + "#" + sheetName}
 * as its source label.
 *
 * @param project     project receiving the imported rows
 * @param metadata    passed through to {@code readTable}
 * @param job         passed through to {@code readTable}
 * @param fileSource  label of the file being imported
 * @param inputStream raw workbook bytes
 * @param limit       passed through to {@code readTable}
 * @param options     importer options; {@code "sheets"} is read here
 * @param exceptions  sink for problems encountered during the import
 */
@Override
public void parseOneFile(Project project, ProjectMetadata metadata, ImportingJob job, String fileSource, InputStream inputStream, int limit, JSONObject options, List<Exception> exceptions) {
    // Presumably the header check needs to peek at the leading bytes and hand them back,
    // so a stream without mark support is wrapped in a small push-back buffer.
    InputStream source = inputStream;
    if (!source.markSupported()) {
        source = new PushbackInputStream(source, 8);
    }

    Workbook workbook;
    try {
        if (POIXMLDocument.hasOOXMLHeader(source)) {
            workbook = new XSSFWorkbook(source);
        } else {
            workbook = new HSSFWorkbook(new POIFSFileSystem(source));
        }
    } catch (IOException e) {
        exceptions.add(new ImportException("Attempted to parse as an Excel file but failed. " + "Try to use Excel to re-save the file as a different Excel version or as TSV and upload again.", e));
        return;
    } catch (ArrayIndexOutOfBoundsException e) {
        exceptions.add(new ImportException("Attempted to parse file as an Excel file but failed. " + "This is probably caused by a corrupt excel file, or due to the file having previously been created or saved by a non-Microsoft application. " + "Please try opening the file in Microsoft Excel and resaving it, then try re-uploading the file. " + "See https://issues.apache.org/bugzilla/show_bug.cgi?id=48261 for further details", e));
        return;
    } catch (IllegalArgumentException e) {
        exceptions.add(new ImportException("Attempted to parse as an Excel file but failed. " + "Only Excel 97 and later formats are supported.", e));
        return;
    } catch (POIXMLException e) {
        exceptions.add(new ImportException("Attempted to parse as an Excel file but failed. " + "Invalid XML.", e));
        return;
    }

    int[] selectedSheets = JSONUtilities.getIntArray(options, "sheets");
    for (int selected : selectedSheets) {
        final Sheet sheet = workbook.getSheetAt(selected);
        // Used below as an inclusive upper bound on the row index.
        final int finalRowIndex = sheet.getLastRowNum();

        TableDataReader sheetReader = new TableDataReader() {

            // Index of the row the next call will return.
            int rowCursor = 0;

            // Handed to extractCell so it can share Recon objects across the cells of this sheet.
            Map<String, Recon> reconCache = new HashMap<String, Recon>();

            @Override
            public List<Object> getNextRowOfCells() throws IOException {
                if (rowCursor > finalRowIndex) {
                    // Sheet exhausted.
                    return null;
                }
                org.apache.poi.ss.usermodel.Row sourceRow = sheet.getRow(rowCursor);
                rowCursor++;

                List<Object> converted = new ArrayList<Object>();
                if (sourceRow == null) {
                    // A missing row is reported as an empty list of cells.
                    return converted;
                }
                short cellBound = sourceRow.getLastCellNum();
                for (short position = 0; position < cellBound; position++) {
                    org.apache.poi.ss.usermodel.Cell sourceCell = sourceRow.getCell(position);
                    // Missing cells keep their slot as null so column positions stay aligned.
                    Cell importedCell = (sourceCell == null) ? null : extractCell(sourceCell, reconCache);
                    converted.add(importedCell);
                }
                return converted;
            }
        };

        String sheetSource = fileSource + "#" + sheet.getSheetName();
        TabularImportingParserBase.readTable(project, metadata, job, sheetReader, sheetSource, limit, options, exceptions);
    }
}
Also used : ArrayList(java.util.ArrayList) POIXMLException(org.apache.poi.POIXMLException) PushbackInputStream(java.io.PushbackInputStream) XSSFWorkbook(org.apache.poi.xssf.usermodel.XSSFWorkbook) Cell(com.google.refine.model.Cell) IOException(java.io.IOException) XSSFWorkbook(org.apache.poi.xssf.usermodel.XSSFWorkbook) Workbook(org.apache.poi.ss.usermodel.Workbook) HSSFWorkbook(org.apache.poi.hssf.usermodel.HSSFWorkbook) HSSFWorkbook(org.apache.poi.hssf.usermodel.HSSFWorkbook) POIFSFileSystem(org.apache.poi.poifs.filesystem.POIFSFileSystem) JSONObject(org.json.JSONObject) Sheet(org.apache.poi.ss.usermodel.Sheet) Recon(com.google.refine.model.Recon) HashMap(java.util.HashMap) Map(java.util.Map)

Example 32 with Cell

use of com.google.refine.model.Cell in project OpenRefine by OpenRefine.

the class OdsImporter method parseOneFile.

/**
 * Parses an OpenDocument spreadsheet from {@code inputStream} and imports every table whose
 * index is listed under the {@code "sheets"} option.
 *
 * <p>If the document cannot be loaded, the exception is added to {@code exceptions} and the
 * method returns without importing anything. Each selected table is exposed as a
 * {@link TableDataReader} and handed to {@link TabularImportingParserBase#readTable}, with
 * {@code fileSource + "#" + tableName} as its source label.
 *
 * @param project     project receiving the imported rows
 * @param metadata    passed through to {@code readTable}
 * @param job         passed through to {@code readTable}
 * @param fileSource  label of the file being imported
 * @param inputStream raw document bytes
 * @param limit       passed through to {@code readTable}
 * @param options     importer options; {@code "sheets"} is read here
 * @param exceptions  sink for problems encountered during the import
 */
@Override
public void parseOneFile(Project project, ProjectMetadata metadata, ImportingJob job, String fileSource, InputStream inputStream, int limit, JSONObject options, List<Exception> exceptions) {
    OdfDocument odfDoc;
    try {
        odfDoc = OdfDocument.loadDocument(inputStream);
    } catch (Exception e) {
        // Ugh! could they throw any wider exception?
        exceptions.add(e);
        return;
    }
    List<OdfTable> tables = odfDoc.getTableList();
    int[] sheets = JSONUtilities.getIntArray(options, "sheets");
    for (int sheetIndex : sheets) {
        final OdfTable table = tables.get(sheetIndex);
        // This is a row COUNT, so valid row indexes run from 0 to rowCount - 1.
        final int rowCount = table.getRowCount();
        TableDataReader dataReader = new TableDataReader() {

            // Index of the row the next call will return.
            int nextRow = 0;

            // Handed to extractCell so it can share Recon objects across the cells of this table.
            Map<String, Recon> reconMap = new HashMap<String, Recon>();

            @Override
            public List<Object> getNextRowOfCells() throws IOException {
                // Stop once every existing row has been served. The previous guard
                // (nextRow > rowCount) let index == rowCount through; per the ODFDOM
                // documentation getRowByIndex appends rows for an out-of-range index,
                // which produced a spurious trailing blank row.
                if (nextRow >= rowCount) {
                    return null;
                }
                List<Object> cells = new ArrayList<Object>();
                OdfTableRow row = table.getRowByIndex(nextRow++);
                if (row != null) {
                    // NOTE(review): this bound is inclusive of getCellCount(). It is left as-is
                    // because getCellCount() may not equal the column count when cells are
                    // merged, so tightening it could skip real data — confirm against ODFDOM
                    // before changing.
                    int lastCell = row.getCellCount();
                    for (int cellIndex = 0; cellIndex <= lastCell; cellIndex++) {
                        Cell cell = null;
                        OdfTableCell sourceCell = row.getCellByIndex(cellIndex);
                        if (sourceCell != null) {
                            cell = extractCell(sourceCell, reconMap);
                        }
                        // Missing cells keep their slot as null so column positions stay aligned.
                        cells.add(cell);
                    }
                }
                return cells;
            }
        };
        TabularImportingParserBase.readTable(project, metadata, job, dataReader, fileSource + "#" + table.getTableName(), limit, options, exceptions);
    }
}
Also used : ArrayList(java.util.ArrayList) IOException(java.io.IOException) FileNotFoundException(java.io.FileNotFoundException) OdfTableCell(org.odftoolkit.odfdom.doc.table.OdfTableCell) OdfDocument(org.odftoolkit.odfdom.doc.OdfDocument) OdfTable(org.odftoolkit.odfdom.doc.table.OdfTable) JSONObject(org.json.JSONObject) Recon(com.google.refine.model.Recon) HashMap(java.util.HashMap) Map(java.util.Map) OdfTableCell(org.odftoolkit.odfdom.doc.table.OdfTableCell) Cell(com.google.refine.model.Cell) OdfTableRow(org.odftoolkit.odfdom.doc.table.OdfTableRow)

Example 33 with Cell

use of com.google.refine.model.Cell in project OpenRefine by OpenRefine.

the class RdfTripleImporter method parseOneFile.

/**
 * Imports RDF triples from {@code input} into {@code project}, one column per predicate.
 *
 * <p>The stream is parsed according to this importer's {@code mode}; a parse failure is
 * added to {@code exceptions} and aborts the import. A {@code "subject"} column is inserted
 * at position 0 and made the key column. Each triple's object is stored in the column named
 * after its predicate: in the first row of that subject whose cell for the column is still
 * blank, or in a newly appended row for the subject when none is. Rows are added to the
 * project grouped by subject, in order of first appearance.
 *
 * @param project    project receiving the columns and rows
 * @param metadata   not used by this method
 * @param job        not used by this method
 * @param fileSource not used by this method
 * @param input      RDF data to parse
 * @param limit      not used by this method
 * @param options    not used by this method
 * @param exceptions sink for parse and model errors
 */
@Override
public void parseOneFile(Project project, ProjectMetadata metadata, ImportingJob job, String fileSource, InputStream input, int limit, JSONObject options, List<Exception> exceptions) {
    Graph graph;
    try {
        switch(mode) {
            case NT:
                graph = rdfReader.parseNTriples(input);
                break;
            case N3:
                graph = rdfReader.parseN3(input);
                break;
            case RDFXML:
                graph = rdfReader.parseRdfXml(input);
                break;
            default:
                throw new IllegalArgumentException("Unknown parsing mode");
        }
    } catch (Exception e) {
        exceptions.add(e);
        return;
    }

    ClosableIterable<Triple> triples = graph.find(ANY_SUBJECT_NODE, ANY_PREDICATE_NODE, ANY_OBJECT_NODE);
    try {
        // Insertion-ordered so subjects reach the project in first-seen order.
        Map<String, List<Row>> rowsBySubject = new LinkedHashMap<String, List<Row>>();

        Column subjectColumn = new Column(project.columnModel.allocateNewCellIndex(), "subject");
        project.columnModel.addColumn(0, subjectColumn, false);
        project.columnModel.setKeyColumnIndex(0);

        for (Triple triple : triples) {
            String subject = triple.getSubject().toString();
            String predicate = triple.getPredicate().toString();
            String object = triple.getObject().toString();

            // One column per distinct predicate, created on first sight.
            Column predicateColumn = project.columnModel.getColumnByName(predicate);
            if (predicateColumn == null) {
                predicateColumn = new Column(project.columnModel.allocateNewCellIndex(), predicate);
                project.columnModel.addColumn(-1, predicateColumn, true);
            }
            int cellIndex = predicateColumn.getCellIndex();

            if (!subjectToRowsHasKey(rowsBySubject, subject)) {
                // First triple for this subject: start its row group with a row that
                // carries both the subject and this object.
                List<Row> subjectRows = new ArrayList<Row>();
                rowsBySubject.put(subject, subjectRows);
                Row firstRow = new Row(project.columnModel.getMaxCellIndex() + 1);
                subjectRows.add(firstRow);
                firstRow.setCell(subjectColumn.getCellIndex(), new Cell(subject, null));
                firstRow.setCell(cellIndex, new Cell(object, null));
                continue;
            }

            List<Row> subjectRows = rowsBySubject.get(subject);
            boolean placed = false;
            for (Row candidate : subjectRows) {
                // Reuse the first existing row whose slot for this predicate is still blank.
                if (!ExpressionUtils.isNonBlankData(candidate.getCellValue(cellIndex))) {
                    candidate.setCell(cellIndex, new Cell(object, null));
                    placed = true;
                    break;
                }
            }
            if (!placed) {
                // Every existing row already has a value for this predicate: add another row
                // (it carries only the object, not the subject).
                Row extraRow = new Row(project.columnModel.getMaxCellIndex() + 1);
                subjectRows.add(extraRow);
                extraRow.setCell(cellIndex, new Cell(object, null));
            }
        }

        for (List<Row> subjectRows : rowsBySubject.values()) {
            project.rows.addAll(subjectRows);
        }
    } catch (ModelException e) {
        exceptions.add(e);
    } finally {
        // NOTE(review): iterator() is called again here, so the object being closed may not be
        // the iterator the loop above consumed — confirm against the JRDF ClosableIterable contract.
        triples.iterator().close();
    }
}

/** Returns whether {@code rowsBySubject} already has a row group for {@code subject}. */
private static boolean subjectToRowsHasKey(Map<String, List<Row>> rowsBySubject, String subject) {
    return rowsBySubject.containsKey(subject);
}
Also used : ModelException(com.google.refine.model.ModelException) ArrayList(java.util.ArrayList) ModelException(com.google.refine.model.ModelException) LinkedHashMap(java.util.LinkedHashMap) Triple(org.jrdf.graph.Triple) Graph(org.jrdf.graph.Graph) Column(com.google.refine.model.Column) ArrayList(java.util.ArrayList) List(java.util.List) Row(com.google.refine.model.Row) Cell(com.google.refine.model.Cell)

Example 34 with Cell

use of com.google.refine.model.Cell in project OpenRefine by OpenRefine.

the class TabularImportingParserBase method readTable.

/**
 * Pulls rows from {@code reader} until it returns {@code null}, the job is cancelled, or the
 * row limit is reached, and loads them into {@code project}.
 *
 * <p>Options read from {@code options} (with the defaults used here): {@code ignoreLines}
 * (-1), {@code headerLines} (1), {@code skipDataLines} (0), {@code limit} (-1),
 * {@code guessCellValueTypes} (false), {@code storeBlankRows} (true),
 * {@code storeBlankCellsAsNulls} (true) and {@code includeFileSources} (false).
 * Rows are consumed in this order: ignored lines, header lines (which supply column names),
 * then data lines. An {@link IOException} from the reader is added to {@code exceptions}
 * and ends the read.
 *
 * @param project    project receiving columns and rows
 * @param metadata   not used by this method
 * @param job        import job; its {@code canceled} flag is polled before each row
 * @param reader     source of rows; each row is a list of {@link Cell}s, raw values or nulls
 * @param fileSource value written to the file-source column when that option is on
 * @param limit      caller-imposed row limit; non-positive means none
 * @param options    importer options, see above
 * @param exceptions sink for I/O errors raised by the reader
 */
public static void readTable(Project project, ProjectMetadata metadata, ImportingJob job, TableDataReader reader, String fileSource, int limit, JSONObject options, List<Exception> exceptions) {
    int linesToIgnore = JSONUtilities.getInt(options, "ignoreLines", -1);
    int headerLinesLeft = JSONUtilities.getInt(options, "headerLines", 1);
    final int dataLinesToSkip = JSONUtilities.getInt(options, "skipDataLines", 0);

    // Effective limit: the smaller of the two positive limits, or whichever one is positive.
    int rowLimit = JSONUtilities.getInt(options, "limit", -1);
    if (limit > 0) {
        rowLimit = (rowLimit > 0) ? Math.min(limit, rowLimit) : limit;
    }

    final boolean guessTypes = JSONUtilities.getBoolean(options, "guessCellValueTypes", false);
    final boolean keepBlankRows = JSONUtilities.getBoolean(options, "storeBlankRows", true);
    final boolean blanksAsNulls = JSONUtilities.getBoolean(options, "storeBlankCellsAsNulls", true);
    final boolean tagWithFileSource = JSONUtilities.getBoolean(options, "includeFileSources", false);

    int fileSourceColumn = -1;
    if (tagWithFileSource) {
        fileSourceColumn = addFilenameColumn(project);
    }

    final List<String> columnNames = new ArrayList<String>();
    // Captured before the header counter is decremented below.
    final boolean namesComeFromHeader = headerLinesLeft > 0;
    int countedDataRows = 0;

    try {
        List<Object> cells;
        while (!job.canceled && (cells = reader.getNextRowOfCells()) != null) {
            // Phase 1: leading lines to discard.
            if (linesToIgnore > 0) {
                linesToIgnore--;
                continue;
            }

            // Phase 2: header lines contribute column names.
            if (headerLinesLeft > 0) {
                for (int c = 0; c < cells.size(); c++) {
                    Object headerCell = cells.get(c);
                    String name;
                    if (headerCell == null) {
                        // A blank header cell still reserves its column.
                        name = "";
                    } else if (headerCell instanceof Cell) {
                        name = ((Cell) headerCell).value.toString().trim();
                    } else {
                        name = headerCell.toString().trim();
                    }
                    ImporterUtilities.appendColumnName(columnNames, c, name);
                }
                headerLinesLeft--;
                if (headerLinesLeft == 0) {
                    ImporterUtilities.setupColumns(project, columnNames);
                }
                continue;
            }

            // Phase 3: data lines.
            Row row = new Row(columnNames.size());
            if (keepBlankRows || cells.size() > 0) {
                countedDataRows++;
            }
            if (dataLinesToSkip > 0 && countedDataRows <= dataLinesToSkip) {
                // Still inside the block of data lines the user asked to skip.
                continue;
            }

            boolean rowHasData = false;
            for (int c = 0; c < cells.size(); c++) {
                Column column = ImporterUtilities.getOrAllocateColumn(project, columnNames, c, namesComeFromHeader);
                Object raw = cells.get(c);

                Cell toStore;
                if (raw instanceof Cell) {
                    // Already a model cell: store as-is.
                    toStore = (Cell) raw;
                    rowHasData = true;
                } else if (ExpressionUtils.isNonBlankData(raw)) {
                    Serializable storable;
                    if (!(raw instanceof String)) {
                        storable = ExpressionUtils.wrapStorable(raw);
                    } else if (guessTypes) {
                        storable = ImporterUtilities.parseCellValue((String) raw);
                    } else {
                        storable = (String) raw;
                    }
                    toStore = new Cell(storable, null);
                    rowHasData = true;
                } else if (blanksAsNulls) {
                    toStore = null;
                } else {
                    // Blank, but the caller wants an explicit empty-string cell.
                    toStore = new Cell("", null);
                }
                row.setCell(column.getCellIndex(), toStore);
            }

            if (rowHasData || keepBlankRows) {
                if (tagWithFileSource && fileSourceColumn >= 0) {
                    row.setCell(fileSourceColumn, new Cell(fileSource, null));
                }
                project.rows.add(row);
            }
            if (rowLimit > 0 && project.rows.size() >= rowLimit) {
                break;
            }
        }
    } catch (IOException e) {
        exceptions.add(e);
    }
}
Also used : Serializable(java.io.Serializable) Column(com.google.refine.model.Column) ArrayList(java.util.ArrayList) JSONObject(org.json.JSONObject) Row(com.google.refine.model.Row) IOException(java.io.IOException) Cell(com.google.refine.model.Cell)

Example 35 with Cell

use of com.google.refine.model.Cell in project OpenRefine by OpenRefine.

the class ExcelImporter method extractCell.

/**
 * Converts a POI cell into a model {@link Cell}, attaching a {@link Recon} when the cell's
 * hyperlink is an http(s) URL containing {@code "freebase.com/view"}.
 *
 * <p>The text following that marker, cut at the first {@code '?'} and then the first
 * {@code '#'} (when found past position 0), is used as the id. A Recon already stored in
 * {@code reconMap} under that id is reused and its {@code judgmentBatchSize} incremented;
 * otherwise a new matched Recon is created and stored in the map.
 *
 * @param cell     source spreadsheet cell
 * @param reconMap id-to-Recon cache; may be modified by this call
 * @return the converted cell, or {@code null} when the single-argument {@code extractCell}
 *         yields no value for {@code cell}
 */
protected static Cell extractCell(org.apache.poi.ss.usermodel.Cell cell, Map<String, Recon> reconMap) {
    final Serializable cellValue = extractCell(cell);
    if (cellValue == null) {
        return null;
    }

    Recon recon = null;
    final Hyperlink link = cell.getHyperlink();
    final String address = (link == null) ? null : link.getAddress();
    final boolean isWebLink = address != null && (address.startsWith("http://") || address.startsWith("https://"));
    if (isWebLink) {
        final String marker = "freebase.com/view";
        final int markerPos = address.indexOf(marker);
        if (markerPos > 0) {
            // Everything after the marker, minus any query string and fragment.
            String id = address.substring(markerPos + marker.length());
            final int queryPos = id.indexOf('?');
            if (queryPos > 0) {
                id = id.substring(0, queryPos);
            }
            final int fragmentPos = id.indexOf('#');
            if (fragmentPos > 0) {
                id = id.substring(0, fragmentPos);
            }

            if (!reconMap.containsKey(id)) {
                recon = new Recon(0, null, null);
                recon.service = "import";
                recon.match = new ReconCandidate(id, cellValue.toString(), new String[0], 100);
                recon.matchRank = 0;
                recon.judgment = Judgment.Matched;
                recon.judgmentAction = "auto";
                recon.judgmentBatchSize = 1;
                recon.addCandidate(recon.match);
                reconMap.put(id, recon);
            } else {
                // Same id seen before: share the Recon and count this cell in its batch.
                recon = reconMap.get(id);
                recon.judgmentBatchSize++;
            }
        }
    }
    return new Cell(cellValue, recon);
}
Also used : Serializable(java.io.Serializable) Recon(com.google.refine.model.Recon) Cell(com.google.refine.model.Cell) ReconCandidate(com.google.refine.model.ReconCandidate) Hyperlink(org.apache.poi.common.usermodel.Hyperlink)

Aggregations

Cell (com.google.refine.model.Cell) 58, Row (com.google.refine.model.Row) 36, Column (com.google.refine.model.Column) 19, Test (org.testng.annotations.Test) 16, RefineTest (com.google.refine.tests.RefineTest) 15, BeforeTest (org.testng.annotations.BeforeTest) 15, JSONObject (org.json.JSONObject) 13, ArrayList (java.util.ArrayList) 12, Project (com.google.refine.model.Project) 11, IOException (java.io.IOException) 11, Properties (java.util.Properties) 11, JSONException (org.json.JSONException) 9, RowVisitor (com.google.refine.browsing.RowVisitor) 7, HistoryEntry (com.google.refine.history.HistoryEntry) 7, Serializable (java.io.Serializable) 7, Recon (com.google.refine.model.Recon) 6, CellChange (com.google.refine.model.changes.CellChange) 6, HashMap (java.util.HashMap) 6, Evaluable (com.google.refine.expr.Evaluable) 5, JSONArray (org.json.JSONArray) 4