
Example 1 with ReplicationFactor

Use of org.apache.cassandra.spark.data.ReplicationFactor in project spark-cassandra-bulkreader by jberragan.

From the class SSTableReaderTests, method testPartialFilterMatch.

@Test
public void testPartialFilterMatch() {
    runTest((partitioner, dir, bridge) -> {
        // write an SSTable
        final TestSchema schema = TestSchema.basic(bridge);
        TestUtils.writeSSTable(bridge, dir, partitioner, schema, (writer) -> {
            for (int i = 0; i < NUM_ROWS; i++) {
                for (int j = 0; j < NUM_COLS; j++) {
                    writer.write(i, j, i + j);
                }
            }
        });
        assertEquals(1, countSSTables(dir));
        final Path dataFile = getFirstFileType(dir, DataLayer.FileType.DATA);
        final TableMetadata metaData = new FourZeroSchemaBuilder(
                schema.createStmt,
                schema.keyspace,
                new ReplicationFactor(ReplicationFactor.ReplicationStrategy.SimpleStrategy,
                                      ImmutableMap.of("replication_factor", 1)),
                partitioner).tableMetaData();
        final TestDataLayer dataLayer = new TestDataLayer(bridge, Collections.singletonList(dataFile));
        final ByteBuffer key1 = Int32Type.instance.fromString("0");
        final BigInteger token1 = bridge.hash(partitioner, key1);
        final PartitionKeyFilter keyInSSTable = PartitionKeyFilter.create(key1, token1);
        final SparkRangeFilter rangeFilter = SparkRangeFilter.create(Range.closed(token1, token1));
        final ByteBuffer key2 = Int32Type.instance.fromString("55");
        final BigInteger token2 = bridge.hash(partitioner, key2);
        final PartitionKeyFilter keyNotInSSTable = PartitionKeyFilter.create(key2, token2);
        final List<CustomFilter> filters = Arrays.asList(rangeFilter, keyInSSTable, keyNotInSSTable);
        final AtomicBoolean pass = new AtomicBoolean(true);
        final AtomicInteger skipCount = new AtomicInteger(0);
        final Stats stats = new Stats() {

            @Override
            public void skippedPartition(ByteBuffer key, BigInteger token) {
                LOGGER.info("Skipping partition: " + token);
                skipCount.incrementAndGet();
                if (filters.stream().anyMatch(filter -> !filter.skipPartition(key, token))) {
                    LOGGER.info("Should not skip partition: " + token);
                    pass.set(false);
                }
            }
        };
        final FourZeroSSTableReader reader = openReader(
                metaData,
                dataLayer.listSSTables().findFirst().orElseThrow(() -> new RuntimeException("Could not find SSTable")),
                filters, false, stats);
        final int rows = countAndValidateRows(reader);
        assertTrue(skipCount.get() > 0);
        assertEquals(NUM_COLS, rows);
        // should skip partitions not matching filters
        assertEquals((NUM_ROWS - skipCount.get()) * NUM_COLS, rows);
        assertTrue(pass.get());
    });
}
Also used : Path(java.nio.file.Path) TableMetadata(org.apache.cassandra.spark.shaded.fourzero.cassandra.schema.TableMetadata) ReplicationFactor(org.apache.cassandra.spark.data.ReplicationFactor) TestSchema(org.apache.cassandra.spark.TestSchema) ByteBuffer(java.nio.ByteBuffer) PartitionKeyFilter(org.apache.cassandra.spark.sparksql.filters.PartitionKeyFilter) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) CustomFilter(org.apache.cassandra.spark.sparksql.filters.CustomFilter) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) TestDataLayer(org.apache.cassandra.spark.TestDataLayer) Stats(org.apache.cassandra.spark.stats.Stats) BigInteger(java.math.BigInteger) SparkRangeFilter(org.apache.cassandra.spark.sparksql.filters.SparkRangeFilter) TestUtils.runTest(org.apache.cassandra.spark.TestUtils.runTest) Test(org.junit.Test)
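
For quick reference, every ReplicationFactor on this page is built the same way: a ReplicationStrategy enum constant plus an options map. A minimal sketch using only the constructors and constants that appear in these examples:

// SimpleStrategy takes a single "replication_factor" option (Examples 1-3)
final ReplicationFactor simple = new ReplicationFactor(
        ReplicationFactor.ReplicationStrategy.SimpleStrategy,
        ImmutableMap.of("replication_factor", 1));

// NetworkTopologyStrategy takes a replica count per data centre (Examples 4-5)
final ReplicationFactor nts = new ReplicationFactor(
        ReplicationFactor.ReplicationStrategy.NetworkTopologyStrategy,
        ImmutableMap.of("DC1", 3, "DC2", 3));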

Example 2 with ReplicationFactor

Use of org.apache.cassandra.spark.data.ReplicationFactor in project spark-cassandra-bulkreader by jberragan.

From the class SSTableReaderTests, method testIncrementalRepair.

// incremental repair
@Test
public void testIncrementalRepair() {
    runTest((partitioner, dir, bridge) -> {
        final TestSchema schema = TestSchema.basic(bridge);
        final int numSSTables = 4;
        final int numRepaired = 2;
        final int numUnRepaired = numSSTables - numRepaired;
        // write some SSTables
        for (int a = 0; a < numSSTables; a++) {
            final int pos = a * NUM_ROWS;
            TestUtils.writeSSTable(bridge, dir, partitioner, schema, (writer) -> {
                for (int i = pos; i < pos + NUM_ROWS; i++) {
                    for (int j = 0; j < NUM_COLS; j++) {
                        writer.write(i, j, i + j);
                    }
                }
            });
        }
        assertEquals(numSSTables, countSSTables(dir));
        final TableMetadata metaData = new FourZeroSchemaBuilder(
                schema.createStmt,
                schema.keyspace,
                new ReplicationFactor(ReplicationFactor.ReplicationStrategy.SimpleStrategy,
                                      ImmutableMap.of("replication_factor", 1)),
                partitioner).tableMetaData();
        final TestDataLayer dataLayer = new TestDataLayer(bridge, getFileType(dir, DataLayer.FileType.DATA).collect(Collectors.toList()));
        final AtomicInteger skipCount = new AtomicInteger(0);
        final Stats stats = new Stats() {

            @Override
            public void skippedRepairedSSTable(DataLayer.SSTable ssTable, long repairedAt) {
                skipCount.incrementAndGet();
            }
        };
        // mark some SSTables as repaired
        final Map<DataLayer.SSTable, Boolean> isRepaired = dataLayer.listSSTables().collect(Collectors.toMap(Function.identity(), a -> false));
        int count = 0;
        for (final DataLayer.SSTable ssTable : isRepaired.keySet()) {
            if (count < numRepaired) {
                isRepaired.put(ssTable, true);
                count++;
            }
        }
        final List<FourZeroSSTableReader> primaryReaders = dataLayer.listSSTables()
                .map(ssTable -> openIncrementalReader(metaData, ssTable, stats, true, isRepaired.get(ssTable)))
                .filter(reader -> !reader.ignore())
                .collect(Collectors.toList());
        final List<FourZeroSSTableReader> nonPrimaryReaders = dataLayer.listSSTables()
                .map(ssTable -> openIncrementalReader(metaData, ssTable, stats, false, isRepaired.get(ssTable)))
                .filter(reader -> !reader.ignore())
                .collect(Collectors.toList());
        // primary repair replica should read all sstables
        assertEquals(numSSTables, primaryReaders.size());
        // non-primary repair replica should only read unrepaired sstables
        assertEquals(numUnRepaired, nonPrimaryReaders.size());
        for (final FourZeroSSTableReader reader : nonPrimaryReaders) {
            assertFalse(isRepaired.get(reader.sstable()));
        }
        assertEquals(numUnRepaired, skipCount.get());
        final Set<FourZeroSSTableReader> toCompact = Stream.concat(
                primaryReaders.stream().filter(r -> isRepaired.get(r.sstable())),
                nonPrimaryReaders.stream())
                .collect(Collectors.toSet());
        assertEquals(numSSTables, toCompact.size());
        int rowCount = 0;
        boolean[] found = new boolean[numSSTables * NUM_ROWS];
        try (final CompactionStreamScanner scanner = new CompactionStreamScanner(metaData, partitioner, toCompact)) {
            // iterate through CompactionScanner and verify we have all the partition keys we are looking for
            final Rid rid = scanner.getRid();
            while (scanner.hasNext()) {
                scanner.next();
                final int a = rid.getPartitionKey().asIntBuffer().get();
                found[a] = true;
                // extract clustering key value and column name
                final ByteBuffer colBuf = rid.getColumnName();
                final ByteBuffer clusteringKey = ByteBufUtils.readBytesWithShortLength(colBuf);
                colBuf.get(); // skip the composite end-of-component byte after the clustering key
                final String colName = ByteBufUtils.string(ByteBufUtils.readBytesWithShortLength(colBuf));
                colBuf.get(); // skip the composite end-of-component byte after the column name
                if (StringUtils.isEmpty(colName)) {
                    continue;
                }
                assertEquals("c", colName);
                final int b = clusteringKey.asIntBuffer().get();
                // extract value column
                final int c = rid.getValue().asIntBuffer().get();
                assertEquals(a + b, c);
                rowCount++;
            }
        }
        assertEquals(numSSTables * NUM_ROWS * NUM_COLS, rowCount);
        for (final boolean b : found) {
            assertTrue(b);
        }
    });
}
Also used : TableMetadata(org.apache.cassandra.spark.shaded.fourzero.cassandra.schema.TableMetadata) Arrays(java.util.Arrays) StringUtils(org.apache.commons.lang.StringUtils) BufferedInputStream(java.io.BufferedInputStream) UnfilteredRowIterator(org.apache.cassandra.spark.shaded.fourzero.cassandra.db.rows.UnfilteredRowIterator) CustomFilter(org.apache.cassandra.spark.sparksql.filters.CustomFilter) LoggerFactory(org.slf4j.LoggerFactory) AbstractRow(org.apache.cassandra.spark.shaded.fourzero.cassandra.db.rows.AbstractRow) SparkRowIterator(org.apache.cassandra.spark.sparksql.SparkRowIterator) ByteBuffer(java.nio.ByteBuffer) DecoratedKey(org.apache.cassandra.spark.shaded.fourzero.cassandra.db.DecoratedKey) PartitionKeyFilter(org.apache.cassandra.spark.sparksql.filters.PartitionKeyFilter) Pair(org.apache.commons.lang3.tuple.Pair) ByteBufUtils(org.apache.cassandra.spark.utils.ByteBufUtils) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) TestUtils(org.apache.cassandra.spark.TestUtils) Map(java.util.Map) NUM_ROWS(org.apache.cassandra.spark.TestUtils.NUM_ROWS) BigInteger(java.math.BigInteger) Path(java.nio.file.Path) SSTablesSupplier(org.apache.cassandra.spark.data.SSTablesSupplier) ImmutableMap(com.google.common.collect.ImmutableMap) Range(com.google.common.collect.Range) Set(java.util.Set) Collectors(java.util.stream.Collectors) List(java.util.List) Stream(java.util.stream.Stream) Assert.assertFalse(org.junit.Assert.assertFalse) TestUtils.getFileType(org.apache.cassandra.spark.TestUtils.getFileType) Optional(java.util.Optional) SparkSSTableReader(org.apache.cassandra.spark.reader.SparkSSTableReader) NotNull(org.jetbrains.annotations.NotNull) Rid(org.apache.cassandra.spark.reader.Rid) Cell(org.apache.cassandra.spark.shaded.fourzero.cassandra.db.rows.Cell) DataInputStream(java.io.DataInputStream) InternalRow(org.apache.spark.sql.catalyst.InternalRow) ReplicationFactor(org.apache.cassandra.spark.data.ReplicationFactor) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) ISSTableScanner(org.apache.cassandra.spark.shaded.fourzero.cassandra.io.sstable.ISSTableScanner) Unfiltered(org.apache.cassandra.spark.shaded.fourzero.cassandra.db.rows.Unfiltered) TestDataLayer(org.apache.cassandra.spark.TestDataLayer) Function(java.util.function.Function) Int32Type(org.apache.cassandra.spark.shaded.fourzero.cassandra.db.marshal.Int32Type) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) TestUtils.runTest(org.apache.cassandra.spark.TestUtils.runTest) Stats(org.apache.cassandra.spark.stats.Stats) ColumnData(org.apache.cassandra.spark.shaded.fourzero.cassandra.db.rows.ColumnData) Descriptor(org.apache.cassandra.spark.shaded.fourzero.cassandra.io.sstable.Descriptor) BufferDecoratedKey(org.apache.cassandra.spark.shaded.fourzero.cassandra.db.BufferDecoratedKey) Murmur3Partitioner(org.apache.cassandra.spark.shaded.fourzero.cassandra.dht.Murmur3Partitioner) SparkRangeFilter(org.apache.cassandra.spark.sparksql.filters.SparkRangeFilter) TestUtils.countSSTables(org.apache.cassandra.spark.TestUtils.countSSTables) Logger(org.slf4j.Logger) Files(java.nio.file.Files) Assert.assertNotNull(org.junit.Assert.assertNotNull) Assert.assertTrue(org.junit.Assert.assertTrue) IOException(java.io.IOException) Test(org.junit.Test) FileInputStream(java.io.FileInputStream) File(java.io.File) Assert.assertNotEquals(org.junit.Assert.assertNotEquals) DataInputPlus(org.apache.cassandra.spark.shaded.fourzero.cassandra.io.util.DataInputPlus) DataLayer(org.apache.cassandra.spark.data.DataLayer) Partitioner(org.apache.cassandra.spark.data.partitioner.Partitioner) Paths(java.nio.file.Paths) NUM_COLS(org.apache.cassandra.spark.TestUtils.NUM_COLS) TestUtils.getFirstFileType(org.apache.cassandra.spark.TestUtils.getFirstFileType) Collections(java.util.Collections) TestSchema(org.apache.cassandra.spark.TestSchema) Assert.assertEquals(org.junit.Assert.assertEquals) InputStream(java.io.InputStream)
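
Example 2 is dense, but the behaviour it pins down reduces to a single rule: a primary repair replica reads every SSTable, while a non-primary replica skips SSTables already marked repaired, so each repaired SSTable is streamed by exactly one replica. A conceptual sketch of that rule as a hypothetical helper (not the reader's actual internals):

// Hypothetical helper mirroring the skip rule the assertions above exercise:
// non-primary repair replicas ignore SSTables that are already repaired.
static boolean skipsRepairedSSTable(boolean isRepairPrimary, boolean isRepaired) {
    return !isRepairPrimary && isRepaired;
}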

Example 3 with ReplicationFactor

Use of org.apache.cassandra.spark.data.ReplicationFactor in project spark-cassandra-bulkreader by jberragan.

From the class SSTableReaderTests, method testFilterKeyMissingInIndex.

@Test
public void testFilterKeyMissingInIndex() {
    runTest((partitioner, dir, bridge) -> {
        // write an SSTable
        final TestSchema schema = TestSchema.basic(bridge);
        TestUtils.writeSSTable(bridge, dir, partitioner, schema, (writer) -> {
            for (int i = 0; i < NUM_ROWS; i++) {
                for (int j = 0; j < NUM_COLS; j++) {
                    writer.write(i, j, i + j);
                }
            }
        });
        assertEquals(1, countSSTables(dir));
        final Path dataFile = getFirstFileType(dir, DataLayer.FileType.DATA);
        final TableMetadata metaData = new FourZeroSchemaBuilder(
                schema.createStmt,
                schema.keyspace,
                new ReplicationFactor(ReplicationFactor.ReplicationStrategy.SimpleStrategy,
                                      ImmutableMap.of("replication_factor", 1)),
                partitioner).tableMetaData();
        final TestDataLayer dataLayer = new TestDataLayer(bridge, Collections.singletonList(dataFile));
        final ByteBuffer key1 = Int32Type.instance.fromString("51");
        final BigInteger token1 = bridge.hash(partitioner, key1);
        final PartitionKeyFilter keyNotInSSTable1 = PartitionKeyFilter.create(key1, token1);
        final ByteBuffer key2 = Int32Type.instance.fromString("90");
        final BigInteger token2 = bridge.hash(partitioner, key2);
        final PartitionKeyFilter keyNotInSSTable2 = PartitionKeyFilter.create(key2, token2);
        final List<CustomFilter> filters = Arrays.asList(keyNotInSSTable1, keyNotInSSTable2);
        final AtomicBoolean pass = new AtomicBoolean(true);
        final AtomicInteger skipCount = new AtomicInteger(0);
        final Stats stats = new Stats() {

            @Override
            public void skippedSSTable(List<CustomFilter> filters, BigInteger firstToken, BigInteger lastToken) {
                pass.set(false);
            }

            @Override
            public void missingInIndex() {
                skipCount.incrementAndGet();
                if (filters.size() != 2) {
                    pass.set(false);
                }
            }
        };
        final FourZeroSSTableReader reader = openReader(
                metaData,
                dataLayer.listSSTables().findFirst().orElseThrow(() -> new RuntimeException("Could not find SSTable")),
                filters, true, stats);
        assertTrue(reader.ignore());
        assertEquals(1, skipCount.get());
        assertTrue(pass.get());
    });
}
Also used : Path(java.nio.file.Path) TableMetadata(org.apache.cassandra.spark.shaded.fourzero.cassandra.schema.TableMetadata) ReplicationFactor(org.apache.cassandra.spark.data.ReplicationFactor) TestSchema(org.apache.cassandra.spark.TestSchema) ByteBuffer(java.nio.ByteBuffer) PartitionKeyFilter(org.apache.cassandra.spark.sparksql.filters.PartitionKeyFilter) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) CustomFilter(org.apache.cassandra.spark.sparksql.filters.CustomFilter) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) TestDataLayer(org.apache.cassandra.spark.TestDataLayer) Stats(org.apache.cassandra.spark.stats.Stats) BigInteger(java.math.BigInteger) List(java.util.List) ArrayList(java.util.ArrayList) TestUtils.runTest(org.apache.cassandra.spark.TestUtils.runTest) Test(org.junit.Test)
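
The partition-key filters in Examples 1 and 3 are always built in the same three steps: serialize the key, hash it to a token under the configured partitioner, then create the filter. A minimal sketch using only calls shown above:

// 1. serialize an int partition key to its byte representation
final ByteBuffer key = Int32Type.instance.fromString("51");
// 2. map the key to its token for the partitioner under test
final BigInteger token = bridge.hash(partitioner, key);
// 3. build the filter handed to the SSTable reader
final PartitionKeyFilter filter = PartitionKeyFilter.create(key, token);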

Example 4 with ReplicationFactor

Use of org.apache.cassandra.spark.data.ReplicationFactor in project spark-cassandra-bulkreader by jberragan.

From the class SchemaBuilderTests, method testUdts.

/* user defined types */
@Test
public void testUdts() {
    final ReplicationFactor rf = new ReplicationFactor(ReplicationFactor.ReplicationStrategy.NetworkTopologyStrategy, ImmutableMap.of("DC1", 3, "DC2", 3));
    final String keyspace = "udt_keyspace";
    final String udtName = "udt_name";
    final FourZeroSchemaBuilder builder = new FourZeroSchemaBuilder(
            "CREATE TABLE " + keyspace + ".udt_test (\n" +
            "    account_id uuid PRIMARY KEY,\n" +
            "    balance bigint,\n" +
            "    info " + udtName + ",\n" +
            "    name text\n" +
            ");",
            keyspace, rf, Partitioner.Murmur3Partitioner,
            toSet("CREATE TYPE " + keyspace + "." + udtName + " (\n" +
                  "  birthday timestamp,\n" +
                  "  nationality text,\n" +
                  "  weight float,\n" +
                  "  height int\n" +
                  ");"));
    final CqlSchema schema = builder.build();
    assertEquals(1, schema.udts().size());
    final CqlField.CqlUdt udt = schema.udts().stream().findFirst().get();
    assertEquals(udtName, udt.name());
    final List<CqlField> udtFields = udt.fields();
    assertEquals(4, udtFields.size());
    assertEquals(bridge.timestamp(), udtFields.get(0).type());
    assertEquals(bridge.text(), udtFields.get(1).type());
    assertEquals(bridge.aFloat(), udtFields.get(2).type());
    assertEquals(bridge.aInt(), udtFields.get(3).type());
    final List<CqlField> fields = schema.fields();
    assertEquals(bridge.uuid(), fields.get(0).type());
    assertEquals(bridge.bigint(), fields.get(1).type());
    assertEquals(CqlField.CqlType.InternalType.Udt, fields.get(2).type().internalType());
    assertEquals(bridge.text(), fields.get(3).type());
    final CqlField.CqlUdt udtField = (CqlField.CqlUdt) fields.get(2).type();
    assertEquals(bridge.timestamp(), udtField.field(0).type());
    assertEquals(bridge.text(), udtField.field(1).type());
    assertEquals(bridge.aFloat(), udtField.field(2).type());
    assertEquals(bridge.aInt(), udtField.field(3).type());
}
Also used : ReplicationFactor(org.apache.cassandra.spark.data.ReplicationFactor) CqlField(org.apache.cassandra.spark.data.CqlField) CqlSchema(org.apache.cassandra.spark.data.CqlSchema) Test(org.junit.Test)
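
Note the design choice this test relies on: the table statement references udt_name before any such type exists, so the UDT's CREATE TYPE statement is supplied up front via toSet(...), letting the builder resolve the info column's type when build() is called.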

Example 5 with ReplicationFactor

Use of org.apache.cassandra.spark.data.ReplicationFactor in project spark-cassandra-bulkreader by jberragan.

From the class SchemaBuilderTests, method testCollections.

@Test
public void testCollections() {
    final String createStmt = "CREATE TABLE backup_test.collection_test (account_id uuid PRIMARY KEY, balance bigint, names set<text>);";
    final ReplicationFactor rf = new ReplicationFactor(ReplicationFactor.ReplicationStrategy.NetworkTopologyStrategy, ImmutableMap.of("DC1", 3, "DC2", 3));
    final CqlSchema schema = new FourZeroSchemaBuilder(createStmt, "backup_test", rf).build();
    assertEquals(CqlField.CqlType.InternalType.Set, schema.getField("names").type().internalType());
}
Also used : ReplicationFactor(org.apache.cassandra.spark.data.ReplicationFactor) CqlSchema(org.apache.cassandra.spark.data.CqlSchema) Test(org.junit.Test)
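
The same builder-and-assert pattern should extend to the other collection kinds. A hedged sketch for a list column, assuming the InternalType enum also defines a List constant alongside the Set and Udt constants seen above (an assumption, not confirmed by these examples):

// Assumes CqlField.CqlType.InternalType.List exists (hypothetical)
final String listStmt = "CREATE TABLE backup_test.list_test (account_id uuid PRIMARY KEY, events list<text>);";
final CqlSchema listSchema = new FourZeroSchemaBuilder(listStmt, "backup_test", rf).build();
assertEquals(CqlField.CqlType.InternalType.List, listSchema.getField("events").type().internalType());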

Aggregations

ReplicationFactor (org.apache.cassandra.spark.data.ReplicationFactor): 30 uses
Test (org.junit.Test): 28 uses
CqlSchema (org.apache.cassandra.spark.data.CqlSchema): 11 uses
CqlField (org.apache.cassandra.spark.data.CqlField): 8 uses
Range (com.google.common.collect.Range): 7 uses
BigInteger (java.math.BigInteger): 6 uses
Path (java.nio.file.Path): 6 uses
ArrayList (java.util.ArrayList): 5 uses
List (java.util.List): 5 uses
TestDataLayer (org.apache.cassandra.spark.TestDataLayer): 5 uses
TestSchema (org.apache.cassandra.spark.TestSchema): 5 uses
TestUtils.runTest (org.apache.cassandra.spark.TestUtils.runTest): 5 uses
TableMetadata (org.apache.cassandra.spark.shaded.fourzero.cassandra.schema.TableMetadata): 5 uses
ImmutableMap (com.google.common.collect.ImmutableMap): 3 uses
ByteBuffer (java.nio.ByteBuffer): 3 uses
Map (java.util.Map): 3 uses
AtomicBoolean (java.util.concurrent.atomic.AtomicBoolean): 3 uses
AtomicInteger (java.util.concurrent.atomic.AtomicInteger): 3 uses
Collectors (java.util.stream.Collectors): 3 uses
CustomFilter (org.apache.cassandra.spark.sparksql.filters.CustomFilter): 3 uses