Example 51 with FileInputSplit

Use of org.apache.flink.core.fs.FileInputSplit in project flink by apache.

From the class CsvInputFormatTest, method readStringFieldsWithTrailingDelimiters.

@Test
public void readStringFieldsWithTrailingDelimiters() {
    try {
        final String fileContent = "abc|-def|-ghijk\nabc|-|-hhg\n|-|-|-\n";
        final FileInputSplit split = createTempFile(fileContent);
        final TupleTypeInfo<Tuple3<String, String, String>> typeInfo = TupleTypeInfo.getBasicTupleTypeInfo(String.class, String.class, String.class);
        final CsvInputFormat<Tuple3<String, String, String>> format = new TupleCsvInputFormat<Tuple3<String, String, String>>(PATH, typeInfo);
        format.setFieldDelimiter("|-");
        format.configure(new Configuration());
        format.open(split);
        // First record: all three fields populated.
        Tuple3<String, String, String> result = new Tuple3<String, String, String>();
        result = format.nextRecord(result);
        assertNotNull(result);
        assertEquals("abc", result.f0);
        assertEquals("def", result.f1);
        assertEquals("ghijk", result.f2);
        // Second record: the empty middle field is parsed as "".
        result = format.nextRecord(result);
        assertNotNull(result);
        assertEquals("abc", result.f0);
        assertEquals("", result.f1);
        assertEquals("hhg", result.f2);
        // Third record: the line consists only of delimiters, so all fields are empty.
        result = format.nextRecord(result);
        assertNotNull(result);
        assertEquals("", result.f0);
        assertEquals("", result.f1);
        assertEquals("", result.f2);
        // No fourth record: the input is exhausted.
        result = format.nextRecord(result);
        assertNull(result);
        assertTrue(format.reachedEnd());
    } catch (Exception ex) {
        fail("Test failed due to a " + ex.getClass().getName() + ": " + ex.getMessage());
    }
}
Also used: FileInputSplit (org.apache.flink.core.fs.FileInputSplit), Configuration (org.apache.flink.configuration.Configuration), IOException (java.io.IOException), ParseException (org.apache.flink.api.common.io.ParseException), Test (org.junit.Test)
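
The createTempFile helper and the PATH constant above are fixtures of the test class. For readers outside that class, a minimal self-contained sketch of the same read path, writing a real temporary file instead of using the fixtures (illustrative only, not part of the Flink sources; imports match the "Also used" lists in this section):

File tempFile = File.createTempFile("csv-sketch", "tmp");
tempFile.deleteOnExit();
OutputStreamWriter wrt = new OutputStreamWriter(new FileOutputStream(tempFile));
wrt.write("abc|-def|-ghijk\n");
wrt.close();
TupleTypeInfo<Tuple3<String, String, String>> typeInfo = TupleTypeInfo.getBasicTupleTypeInfo(String.class, String.class, String.class);
CsvInputFormat<Tuple3<String, String, String>> format = new TupleCsvInputFormat<Tuple3<String, String, String>>(new Path(tempFile.toURI().toString()), typeInfo);
format.setFieldDelimiter("|-");
format.configure(new Configuration());
FileInputSplit[] splits = format.createInputSplits(1);
format.open(splits[0]);
// Expected result: ("abc", "def", "ghijk").
Tuple3<String, String, String> record = format.nextRecord(new Tuple3<String, String, String>());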

Example 52 with FileInputSplit

Use of org.apache.flink.core.fs.FileInputSplit in project flink by apache.

From the class CsvInputFormatTest, method testPojoTypeWithPartialFieldInCSV.

@Test
public void testPojoTypeWithPartialFieldInCSV() throws Exception {
    File tempFile = File.createTempFile("CsvReaderPojoType", "tmp");
    tempFile.deleteOnExit();
    tempFile.setWritable(true);
    OutputStreamWriter wrt = new OutputStreamWriter(new FileOutputStream(tempFile));
    wrt.write("123,NODATA,AAA,NODATA,3.123,BBB\n");
    wrt.write("456,NODATA,BBB,NODATA,1.123,AAA\n");
    wrt.close();
    @SuppressWarnings("unchecked")
    PojoTypeInfo<PojoItem> typeInfo = (PojoTypeInfo<PojoItem>) TypeExtractor.createTypeInfo(PojoItem.class);
    // The mask keeps CSV columns 0, 2, 4 and 5; the two NODATA columns are skipped.
    CsvInputFormat<PojoItem> inputFormat = new PojoCsvInputFormat<PojoItem>(new Path(tempFile.toURI().toString()), typeInfo, new boolean[] { true, false, true, false, true, true });
    inputFormat.configure(new Configuration());
    FileInputSplit[] splits = inputFormat.createInputSplits(1);
    inputFormat.open(splits[0]);
    validatePojoItem(inputFormat);
}
Also used: Path (org.apache.flink.core.fs.Path), Configuration (org.apache.flink.configuration.Configuration), PojoTypeInfo (org.apache.flink.api.java.typeutils.PojoTypeInfo), FileInputSplit (org.apache.flink.core.fs.FileInputSplit), FileOutputStream (java.io.FileOutputStream), OutputStreamWriter (java.io.OutputStreamWriter), File (java.io.File), Test (org.junit.Test)
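
The PojoItem fixture is defined elsewhere in CsvInputFormatTest. A plausible shape, inferred from the include-mask and the sample rows above (the field names and types below are an assumption, not the actual Flink source):

// Inferred sketch of the PojoItem test fixture; names and types are assumptions.
// The mask { true, false, true, false, true, true } keeps CSV columns 0, 2, 4 and 5.
public static class PojoItem {
    public int field1;    // column 0: 123 / 456
    public String field2; // column 2: AAA / BBB
    public Double field3; // column 4: 3.123 / 1.123
    public String field4; // column 5: BBB / AAA
}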

Example 53 with FileInputSplit

Use of org.apache.flink.core.fs.FileInputSplit in project flink by apache.

From the class CsvInputFormatTest, method testQuotedStringParsingWithIncludeFields.

@Test
public void testQuotedStringParsingWithIncludeFields() throws Exception {
    final String fileContent = "\"20:41:52-1-3-2015\"|\"Re: Taskmanager memory error in Eclipse\"|" + "\"Blahblah <blah@blahblah.org>\"|\"blaaa|\"blubb\"";
    final File tempFile = File.createTempFile("CsvReaderQuotedString", "tmp");
    tempFile.deleteOnExit();
    tempFile.setWritable(true);
    OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(tempFile));
    writer.write(fileContent);
    writer.close();
    TupleTypeInfo<Tuple2<String, String>> typeInfo = TupleTypeInfo.getBasicTupleTypeInfo(String.class, String.class);
    // The mask { true, false, true } projects physical columns 0 and 2 onto the two tuple fields.
    CsvInputFormat<Tuple2<String, String>> inputFormat = new TupleCsvInputFormat<Tuple2<String, String>>(new Path(tempFile.toURI().toString()), typeInfo, new boolean[] { true, false, true });
    inputFormat.enableQuotedStringParsing('"');
    inputFormat.setFieldDelimiter("|");
    inputFormat.setDelimiter('\n');
    inputFormat.configure(new Configuration());
    FileInputSplit[] splits = inputFormat.createInputSplits(1);
    inputFormat.open(splits[0]);
    Tuple2<String, String> record = inputFormat.nextRecord(new Tuple2<String, String>());
    assertEquals("20:41:52-1-3-2015", record.f0);
    assertEquals("Blahblah <blah@blahblah.org>", record.f1);
}
Also used: Path (org.apache.flink.core.fs.Path), Configuration (org.apache.flink.configuration.Configuration), FileInputSplit (org.apache.flink.core.fs.FileInputSplit), FileOutputStream (java.io.FileOutputStream), OutputStreamWriter (java.io.OutputStreamWriter), File (java.io.File), Test (org.junit.Test)
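
Because of the include-mask, the subject line is skipped and the malformed trailing column is never parsed at all. A minimal sketch of the quoting behavior in isolation (hypothetical file content, same APIs as above): once quoted string parsing is enabled, a '|' inside quotes is data rather than a delimiter.

File f = File.createTempFile("CsvQuotedSketch", "tmp");
f.deleteOnExit();
OutputStreamWriter w = new OutputStreamWriter(new FileOutputStream(f));
// One line, two quoted fields; the '|' inside the first pair of quotes is part of the field.
w.write("\"a|b\"|\"c\"\n");
w.close();
TupleTypeInfo<Tuple2<String, String>> ti = TupleTypeInfo.getBasicTupleTypeInfo(String.class, String.class);
CsvInputFormat<Tuple2<String, String>> fmt = new TupleCsvInputFormat<Tuple2<String, String>>(new Path(f.toURI().toString()), ti);
fmt.enableQuotedStringParsing('"');
fmt.setFieldDelimiter("|");
fmt.configure(new Configuration());
FileInputSplit[] sp = fmt.createInputSplits(1);
fmt.open(sp[0]);
// Expected: rec.f0 == "a|b", rec.f1 == "c".
Tuple2<String, String> rec = fmt.nextRecord(new Tuple2<String, String>());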

Example 54 with FileInputSplit

Use of org.apache.flink.core.fs.FileInputSplit in project flink by apache.

From the class ContinuousFileReaderOperator, method restoreState.

// ------------------------------------------------------------------------
//  Restoring / Migrating from an older Flink version.
// ------------------------------------------------------------------------
@Override
public void restoreState(FSDataInputStream in) throws Exception {
    LOG.info("{} (taskIdx={}) restoring state from an older Flink version.", getClass().getSimpleName(), getRuntimeContext().getIndexOfThisSubtask());
    // this is just to read the byte indicating if we have udf state or not
    int hasUdfState = in.read();
    Preconditions.checkArgument(hasUdfState == 0);
    final ObjectInputStream ois = new ObjectInputStream(in);
    final DataInputViewStreamWrapper div = new DataInputViewStreamWrapper(in);
    // read the split that was being read
    FileInputSplit currSplit = (FileInputSplit) ois.readObject();
    // read the pending splits list
    List<FileInputSplit> pendingSplits = new LinkedList<>();
    int noOfSplits = div.readInt();
    for (int i = 0; i < noOfSplits; i++) {
        FileInputSplit split = (FileInputSplit) ois.readObject();
        pendingSplits.add(split);
    }
    // read the state of the format
    Serializable formatState = (Serializable) ois.readObject();
    div.close();
    if (restoredReaderState == null) {
        restoredReaderState = new ArrayList<>();
    }
    // we do not know the modification time of the retrieved splits, so we assign them
    // artificial ones, with the only constraint that they respect the relative order of the
    // retrieved splits, because modification time is going to be used to sort the splits within
    // the "pending splits" priority queue.
    long now = getProcessingTimeService().getCurrentProcessingTime();
    long runningModTime = Math.max(now, noOfSplits + 1);
    TimestampedFileInputSplit currentSplit = createTimestampedFileSplit(currSplit, --runningModTime, formatState);
    restoredReaderState.add(currentSplit);
    for (FileInputSplit split : pendingSplits) {
        TimestampedFileInputSplit timestampedSplit = createTimestampedFileSplit(split, --runningModTime);
        restoredReaderState.add(timestampedSplit);
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug("{} (taskIdx={}) restored {} splits from legacy: {}.", getClass().getSimpleName(), getRuntimeContext().getIndexOfThisSubtask(), restoredReaderState.size(), restoredReaderState);
    }
}
Also used: FileInputSplit (org.apache.flink.core.fs.FileInputSplit), Serializable (java.io.Serializable), DataInputViewStreamWrapper (org.apache.flink.core.memory.DataInputViewStreamWrapper), LinkedList (java.util.LinkedList), ObjectInputStream (java.io.ObjectInputStream)
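
A note on the artificial modification times above: runningModTime starts at max(now, noOfSplits + 1) and is pre-decremented once per split, so every split receives a distinct timestamp, the timestamps decrease in the order the splits were read back, and the initial guard guarantees they never go negative. A worked illustration with hypothetical values:

// Illustration only (hypothetical values, not Flink code).
long now = 1_000L;
int noOfSplits = 2;
long runningModTime = Math.max(now, noOfSplits + 1); // 1000
long currentSplitTs = --runningModTime; // 999, assigned to the split being read
long pendingTs1 = --runningModTime;     // 998, first pending split
long pendingTs2 = --runningModTime;     // 997, second pending split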

Example 55 with FileInputSplit

Use of org.apache.flink.core.fs.FileInputSplit in project flink by apache.

From the class ContinuousFileMonitoringFunction, method getInputSplitsSortedByModTime.

/**
 * Creates the input splits to be forwarded to the downstream tasks of the
 * {@link ContinuousFileReaderOperator}. Splits are sorted <b>by modification time</b> before
 * being forwarded, and only splits belonging to files in the {@code eligibleFiles}
 * map will be processed.
 *
 * @param eligibleFiles The files to process.
 */
private Map<Long, List<TimestampedFileInputSplit>> getInputSplitsSortedByModTime(Map<Path, FileStatus> eligibleFiles) throws IOException {
    Map<Long, List<TimestampedFileInputSplit>> splitsByModTime = new TreeMap<>();
    if (eligibleFiles.isEmpty()) {
        return splitsByModTime;
    }
    for (FileInputSplit split : format.createInputSplits(readerParallelism)) {
        FileStatus fileStatus = eligibleFiles.get(split.getPath());
        if (fileStatus != null) {
            Long modTime = fileStatus.getModificationTime();
            List<TimestampedFileInputSplit> splitsToForward = splitsByModTime.get(modTime);
            if (splitsToForward == null) {
                splitsToForward = new ArrayList<>();
                splitsByModTime.put(modTime, splitsToForward);
            }
            splitsToForward.add(new TimestampedFileInputSplit(modTime, split.getSplitNumber(), split.getPath(), split.getStart(), split.getLength(), split.getHostnames()));
        }
    }
    return splitsByModTime;
}
Also used: FileInputSplit (org.apache.flink.core.fs.FileInputSplit), FileStatus (org.apache.flink.core.fs.FileStatus), ArrayList (java.util.ArrayList), List (java.util.List), TreeMap (java.util.TreeMap)
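
A small design note: the get-then-put grouping in the loop predates Java 8; with Map.computeIfAbsent the same logic can be written more compactly (behavior-preserving sketch of the loop body):

// Equivalent grouping using computeIfAbsent (Java 8+).
splitsByModTime.computeIfAbsent(modTime, k -> new ArrayList<>()).add(new TimestampedFileInputSplit(modTime, split.getSplitNumber(), split.getPath(), split.getStart(), split.getLength(), split.getHostnames()));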

Aggregations

FileInputSplit (org.apache.flink.core.fs.FileInputSplit): 140 uses
Test (org.junit.Test): 119 uses
Configuration (org.apache.flink.configuration.Configuration): 93 uses
Path (org.apache.flink.core.fs.Path): 59 uses
IOException (java.io.IOException): 45 uses
File (java.io.File): 36 uses
FileOutputStream (java.io.FileOutputStream): 23 uses
TypeInformation (org.apache.flink.api.common.typeinfo.TypeInformation): 20 uses
Row (org.apache.flink.types.Row): 20 uses
OutputStreamWriter (java.io.OutputStreamWriter): 18 uses
ParseException (org.apache.flink.api.common.io.ParseException): 17 uses
ExecutionEnvironment (org.apache.flink.api.java.ExecutionEnvironment): 17 uses
DoubleValue (org.apache.flink.types.DoubleValue): 17 uses
IntValue (org.apache.flink.types.IntValue): 17 uses
LongValue (org.apache.flink.types.LongValue): 17 uses
StringValue (org.apache.flink.types.StringValue): 17 uses
Value (org.apache.flink.types.Value): 17 uses
Plan (org.apache.flink.api.common.Plan): 12 uses
ReplicatingInputFormat (org.apache.flink.api.common.io.ReplicatingInputFormat): 12 uses
Tuple1 (org.apache.flink.api.java.tuple.Tuple1): 12 uses