use of org.knime.core.data.RowIterator in project knime-core by knime.
the class AbstractColumnTableSorter method sortOnDisk.
/**
* Sorts the given data table using a disk-based k-way merge sort.
*
* @param dataTable the data table that should be sorted
* @param exec an execution context for reporting progress and creating BufferedDataContainers
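* @param resultListener the consumer that receives the merged, sorted output rows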
* @throws CanceledExecutionException if the user has canceled execution
*/
private void sortOnDisk(final DataTable dataTable, final ExecutionMonitor exec, final SortingConsumer resultListener) throws CanceledExecutionException {
final List<AbstractTableSorter> columnPartitions = new ArrayList<>(m_sortDescriptions.length);
// Each sort description is handled as its own external merge sort over its part of the data
for (int i = 0; i < m_sortDescriptions.length; i++) {
AbstractTableSorter tableSorter = createTableSorter(m_rowCount, m_sortDescriptions[i].createDataTableSpec(m_dataTableSpec), m_sortDescriptions[i]);
columnPartitions.add(tableSorter);
}
exec.setMessage("Reading table");
RowIterator iterator = dataTable.iterator();
ExecutionMonitor readProgress = exec.createSubProgress(0.7);
// phase one: create as big chunks as possible from the given input table
// for each sort description
int chunkCount = 0;
long currentTotalRows = 0L;
while (iterator.hasNext()) {
LOGGER.debugWithFormat("Reading temporary tables -- (chunk %d)", chunkCount);
assert m_buffer.values().stream().allMatch(l -> l.isEmpty());
long bufferedRows = fillBuffer(iterator, exec);
LOGGER.debugWithFormat("Writing temporary tables -- (chunk %d with %d rows)", chunkCount, bufferedRows);
currentTotalRows += bufferedRows;
readProgress.setProgress(currentTotalRows / (double) m_rowCount, String.format("Writing temporary tables (chunk %d with %d rows)", chunkCount, bufferedRows));
chunkCount += 1;
LOGGER.debugWithFormat("Sorting temporary tables -- (chunk %d with %d rows)", chunkCount, bufferedRows);
sortBufferInParallel();
for (AbstractTableSorter tableSorter : columnPartitions) {
tableSorter.openChunk();
}
LOGGER.debugWithFormat("Writing temporary tables (chunk %d with %d rows)", chunkCount, bufferedRows);
for (int i = 0; i < m_sortDescriptions.length; i++) {
SortingDescription sortingDescription = m_sortDescriptions[i];
LOGGER.debugWithFormat("Writing temporary table (chunk %d, column %d)", chunkCount, i);
AbstractTableSorter tableSorter = columnPartitions.get(i);
ListIterator<DataRow> rowIterator = m_buffer.get(sortingDescription).listIterator();
while (rowIterator.hasNext()) {
tableSorter.addRowToChunk(rowIterator.next());
// release the row as early as possible
rowIterator.set(null);
}
exec.checkCanceled();
}
for (AbstractTableSorter tableSorter : columnPartitions) {
tableSorter.closeChunk();
}
clearBuffer();
}
readProgress.setProgress(1.0);
// phase 2: merge the temporary tables
exec.setMessage("Merging temporary tables.");
ExecutionMonitor mergingProgress = exec.createSubProgress(0.3);
List<Iterator<DataRow>> partitionRowIterators = mergePartitions(columnPartitions, mergingProgress, chunkCount);
// publish the results to the listener
List<DataRow> currentRow = new ArrayList<>();
long rowNo = 0;
Iterator<DataRow> firstPartitionIterator = partitionRowIterators.isEmpty() ? Collections.<DataRow>emptyList().iterator() : partitionRowIterators.get(0);
while (firstPartitionIterator.hasNext()) {
for (int i = 0; i < partitionRowIterators.size(); i++) {
currentRow.add(partitionRowIterators.get(i).next());
}
resultListener.consume(aggregateRows(new RowKey("AutoGenerated" + rowNo++), currentRow));
currentRow.clear();
}
}
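The method above implements the two phases named in its comments: phase one fills an in-memory buffer, sorts it, and writes each sorted chunk to disk; phase two k-way merges the chunks. The following is a minimal, self-contained sketch of that disk-based merge pattern written against plain java.io/java.nio rather than the KNIME API; the class and method names, the String element type, and the chunk-size parameter are illustrative assumptions, not KNIME code.
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.PriorityQueue;

public final class ExternalMergeSortSketch {

    /** Phase one: read the input in chunks, sort each chunk in memory, write it to a temp file. */
    static List<Path> writeSortedChunks(final Iterator<String> input, final int maxRowsPerChunk) throws IOException {
        List<Path> chunkFiles = new ArrayList<>();
        List<String> buffer = new ArrayList<>(maxRowsPerChunk);
        while (input.hasNext()) {
            buffer.clear();
            while (input.hasNext() && buffer.size() < maxRowsPerChunk) {
                buffer.add(input.next());
            }
            Collections.sort(buffer);
            Path chunk = Files.createTempFile("sort-chunk", ".txt");
            try (BufferedWriter out = Files.newBufferedWriter(chunk)) {
                for (String row : buffer) {
                    out.write(row);
                    out.newLine();
                }
            }
            chunkFiles.add(chunk);
        }
        return chunkFiles;
    }

    /** Holds the smallest unread line of one chunk file together with the index of that file. */
    private static final class Head {
        private final String m_line;
        private final int m_source;
        Head(final String line, final int source) {
            m_line = line;
            m_source = source;
        }
    }

    /** Phase two: k-way merge of the sorted chunk files via a priority queue of chunk heads. */
    static void mergeChunks(final List<Path> chunkFiles, final BufferedWriter out) throws IOException {
        List<BufferedReader> readers = new ArrayList<>();
        PriorityQueue<Head> queue = new PriorityQueue<>(Comparator.comparing((Head h) -> h.m_line));
        try {
            // prime the queue with the first line of every chunk
            for (int i = 0; i < chunkFiles.size(); i++) {
                BufferedReader reader = Files.newBufferedReader(chunkFiles.get(i));
                readers.add(reader);
                String first = reader.readLine();
                if (first != null) {
                    queue.add(new Head(first, i));
                }
            }
            // repeatedly emit the globally smallest head and refill from its source chunk
            while (!queue.isEmpty()) {
                Head smallest = queue.poll();
                out.write(smallest.m_line);
                out.newLine();
                String next = readers.get(smallest.m_source).readLine();
                if (next != null) {
                    queue.add(new Head(next, smallest.m_source));
                }
            }
        } finally {
            for (BufferedReader reader : readers) {
                reader.close();
            }
        }
    }
}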
use of org.knime.core.data.RowIterator in project knime-core by knime.
the class DataContainerTest method testBigFile.
// testAddRowToTable()
/**
* Try a big file :-).
*/
public void testBigFile() {
// with these settings (50, 1000) it will write a 250MB cache file
// (the last date this value was checked: 31 August 2006)
final int colCount = 50;
final int rowCount = 100;
String[] names = new String[colCount];
DataType[] types = new DataType[colCount];
for (int c = 0; c < colCount; c++) {
names[c] = "Column " + c;
switch(c % 3) {
case 0:
types[c] = DoubleCell.TYPE;
break;
case 1:
types[c] = StringCell.TYPE;
break;
case 2:
types[c] = IntCell.TYPE;
break;
default:
throw new InternalError();
}
}
DataTableSpec spec = new DataTableSpec(names, types);
names = null;
types = null;
DataContainer container = new DataContainer(spec);
final ObjectToDataCellConverter conv = new ObjectToDataCellConverter();
final long seed = System.currentTimeMillis();
Random rand = new Random(seed);
for (int i = 0; i < rowCount; i++) {
DataRow row = createRandomRow(i, colCount, rand, conv);
container.addRowToTable(row);
}
container.close();
final Throwable[] throwables = new Throwable[1];
final DataTable table = container.getTable();
Runnable runnable = new Runnable() {
@Override
public void run() {
try {
int i = 0;
Random rand1 = new Random(seed);
for (RowIterator it = table.iterator(); it.hasNext(); i++) {
DataRow row1 = createRandomRow(i, colCount, rand1, conv);
DataRow row2 = it.next();
assertEquals(row1, row2);
}
assertEquals(i, rowCount);
} catch (Throwable t) {
throwables[0] = t;
}
}
};
// Runnable
// make two threads read the buffer (file) concurrently.
Thread t1 = new Thread(runnable);
Thread t2 = new Thread(runnable);
t1.start();
t2.start();
try {
// it seems that the thread running this test must not release the
// reference to the table, otherwise it is (presumably) garbage
// collected: comment out these lines to see the error message.
t1.join();
t2.join();
} catch (InterruptedException ie) {
ie.printStackTrace();
fail();
}
if (throwables[0] != null) {
throw new RuntimeException(throwables[0]);
}
}
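The test above starts two threads that read the same cached table and captures any Throwable so the main thread can rethrow it after join(); otherwise a failure inside a worker thread would be silently lost. Below is a minimal, standalone sketch of that pattern using an AtomicReference instead of a Throwable array; the shared list and the squared-value check are placeholder assumptions standing in for the actual row comparison.
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.atomic.AtomicReference;

public final class ConcurrentReadCheckSketch {

    public static void main(final String[] args) throws InterruptedException {
        // shared, read-only data that both threads verify concurrently
        final List<Integer> sharedValues = new ArrayList<>();
        for (int i = 0; i < 1_000; i++) {
            sharedValues.add(i * i);
        }
        final AtomicReference<Throwable> failure = new AtomicReference<>();
        Runnable verification = () -> {
            try {
                // placeholder verification: recompute the expected value and compare
                for (int i = 0; i < sharedValues.size(); i++) {
                    int expected = i * i;
                    if (expected != sharedValues.get(i)) {
                        throw new IllegalStateException("mismatch at index " + i);
                    }
                }
            } catch (Throwable t) {
                failure.compareAndSet(null, t); // remember only the first failure
            }
        };
        Thread t1 = new Thread(verification);
        Thread t2 = new Thread(verification);
        t1.start();
        t2.start();
        t1.join();
        t2.join();
        if (failure.get() != null) {
            throw new RuntimeException("a reader thread failed", failure.get());
        }
    }
}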
use of org.knime.core.data.RowIterator in project knime-core by knime.
the class DataContainerTest method testMemoryAlertWhileWrite.
public void testMemoryAlertWhileWrite() throws Exception {
DataContainer cont = new DataContainer(SPEC_STR_INT_DBL, true, 1000000);
int nrRows = 10;
RowIterator it = generateRows(nrRows);
int i = 0;
for (; i < nrRows / 2; i++) {
cont.addRowToTable(it.next());
}
Buffer buffer = cont.getBuffer();
synchronized (buffer) {
buffer.writeAllRowsFromListToFile();
}
for (; i < nrRows; i++) {
cont.addRowToTable(it.next());
}
cont.close();
RowIterator tableIT = cont.getTable().iterator();
for (RowIterator r = generateRows(nrRows); r.hasNext(); ) {
DataRow expected = r.next();
DataRow actual = tableIT.next();
assertEquals(expected, actual);
}
}
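generateRows(int) is a helper defined elsewhere in DataContainerTest and is not shown here. Purely as an illustration, and assuming SPEC_STR_INT_DBL describes a (String, Int, Double) table, a deterministic version could look like the sketch below; because it regenerates identical rows on every call, the comparison loop at the end of the test can re-create the expected content. The actual KNIME helper may differ.
// Hypothetical stand-in for DataContainerTest.generateRows; assumes the usual
// org.knime.core.data imports (DefaultRow, RowKey, StringCell, IntCell, DoubleCell).
private static RowIterator generateRows(final int count) {
    return new RowIterator() {
        private int m_index = 0;

        @Override
        public boolean hasNext() {
            return m_index < count;
        }

        @Override
        public DataRow next() {
            // deterministic cell values so repeated calls yield identical rows
            DataRow row = new DefaultRow(new RowKey("Row" + m_index),
                new StringCell("String " + m_index), new IntCell(m_index), new DoubleCell(m_index / 2.0));
            m_index++;
            return row;
        }
    };
}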
use of org.knime.core.data.RowIterator in project knime-core by knime.
the class DataContainerTest method testRowOrder.
/**
* method being tested: addRowToTable().
*/
public final void testRowOrder() {
// addRowToTable should preserve the insertion order; we use shuffled
// integer-based row keys (the container puts them in a linked hash map)
DataCell[] values = new DataCell[0];
Vector<RowKey> order = new Vector<RowKey>(500);
for (int i = 0; i < 500; i++) {
// fill it - this should be easy to preserve (as the int value
// is also the hash code)
order.add(new RowKey(Integer.toString(i)));
}
// shuffle it - that should screw it up
Collections.shuffle(order);
DataContainer c = new DataContainer(EMPTY_SPEC);
for (RowKey key : order) {
c.addRowToTable(new DefaultRow(key, values));
}
c.close();
DataTable table = c.getTable();
int pos = 0;
for (RowIterator it = table.iterator(); it.hasNext(); pos++) {
DataRow cur = it.next();
assertEquals(cur.getKey().getString(), order.get(pos).getString());
}
assertEquals(pos, order.size());
}
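The comment in the test relies on the container keeping rows in a linked hash map, whose iteration order is the insertion order rather than the keys' hash or numeric order. A tiny standalone illustration with java.util.LinkedHashMap (plain Java, not KNIME code):
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

public final class InsertionOrderSketch {
    public static void main(final String[] args) {
        List<String> keys = new ArrayList<>();
        for (int i = 0; i < 500; i++) {
            keys.add(Integer.toString(i));
        }
        Collections.shuffle(keys);
        Map<String, Integer> map = new LinkedHashMap<>();
        for (String key : keys) {
            map.put(key, key.length());
        }
        // iteration order matches the shuffled insertion order, not numeric or hash order
        int pos = 0;
        for (String key : map.keySet()) {
            if (!key.equals(keys.get(pos++))) {
                throw new IllegalStateException("order not preserved at position " + (pos - 1));
            }
        }
    }
}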
use of org.knime.core.data.RowIterator in project knime-core by knime.
the class DefaultTableTest method createWorkingTables.
// testDefaultTable()
/**
* Creates some new <code>DefaultTable</code> objects that should,
* theoretically, work without problems.
*/
private void createWorkingTables() {
// some constructors with (supposedly) no problems
DefaultTable t1 = new DefaultTable(CASE_1_CONTENT, CASE_1_ROWHEADER, CASE_1_COLHEADER);
DataTableSpec t1Spec = t1.getDataTableSpec();
assertEquals(CASE_1_COLHEADER.length, t1Spec.getNumColumns());
// check spec
for (int c = 0; c < CASE_1_COLHEADER.length; c++) {
DataColumnSpec currentColumnSpec = t1Spec.getColumnSpec(c);
String colName = currentColumnSpec.getName().toString();
DataType type = currentColumnSpec.getType();
assertEquals(colName, CASE_1_COLHEADER[c]);
assertTrue(type.isCompatible(StringValue.class));
}
// check content
int r = 0;
for (RowIterator it = t1.iterator(); it.hasNext(); r++) {
DataRow row = it.next();
assertEquals(row.getNumCells(), CASE_1_COLHEADER.length);
for (int i = 0; i < CASE_1_COLHEADER.length; i++) {
StringValue cell = (StringValue) row.getCell(i);
assertEquals(cell.getStringValue(), CASE_1_CONTENT[r][i]);
}
}
// all one-dimensional arrays (meta-info) are optional
new DefaultTable(CASE_1_CONTENT, null, CASE_1_COLHEADER);
new DefaultTable(CASE_1_CONTENT, CASE_1_ROWHEADER, null);
new DefaultTable(CASE_1_CONTENT, CASE_1_ROWHEADER, CASE_1_COLHEADER);
new DefaultTable(CASE_1_CONTENT, null, null);
new DefaultTable(CASE_1_CONTENT, null, CASE_1_COLHEADER);
new DefaultTable(CASE_1_CONTENT, null, null);
}