org.kitesdk.data.PartitionStrategy Java Examples
The following examples show how to use
org.kitesdk.data.PartitionStrategy.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: TimeDomain.java From kite with Apache License 2.0 | 6 votes |
/**
 * Builds the time domain for one source field of a partition strategy.
 *
 * Collects the {@link CalendarFieldPartitioner}s in {@code strategy} whose
 * source is {@code sourceName}, then keeps them in the sequence given by
 * {@code order}, stopping at the first gap that follows a match.
 *
 * @param strategy   the strategy whose field partitioners are inspected
 * @param sourceName the source field this domain covers
 */
public TimeDomain(PartitionStrategy strategy, String sourceName) {
  Map<Integer, CalendarFieldPartitioner> byCalendarField = Maps.newHashMap();
  for (FieldPartitioner candidate : Accessor.getDefault().getFieldPartitioners(strategy)) {
    // a strategy can partition several source fields; only this one matters here
    if (!sourceName.equals(candidate.getSourceName())) {
      continue;
    }
    if (!(candidate instanceof CalendarFieldPartitioner)) {
      continue;
    }
    CalendarFieldPartitioner calendar = (CalendarFieldPartitioner) candidate;
    byCalendarField.put(calendar.getCalendarField(), calendar);
  }

  // select the partitioners that will be checked, following the field order
  this.partitioners = Lists.newArrayList();
  for (int calendarField : order) {
    if (byCalendarField.containsKey(calendarField)) {
      partitioners.add(byCalendarField.get(calendarField));
      continue;
    }
    // once a field is missing after a match, every finer field is included,
    // e.g. yyyy/mm/dd partitioning accepts everything at the hour level
    if (!partitioners.isEmpty()) {
      break;
    }
  }
}
Example #2
Source File: TestCreatePartitionStrategyCommand.java From kite with Apache License 2.0 | 6 votes |
/**
 * Running the command with year/month/day/hour/minute partitions on
 * {@code created_at} should print the equivalent builder-made strategy and
 * nothing else.
 */
@Test
public void testTime() throws Exception {
  // the strategy the command is expected to print
  PartitionStrategy expected = new PartitionStrategy.Builder()
      .year("created_at")
      .month("created_at")
      .day("created_at")
      .hour("created_at")
      .minute("created_at")
      .build();

  command.partitions = Lists.newArrayList(
      "created_at:year",
      "created_at:month",
      "created_at:day",
      "created_at:hour",
      "created_at:minute");
  command.run();

  verify(console).info(expected.toString(true));
  verifyNoMoreInteractions(console);
}
Example #3
Source File: TestCompatibilityChecks.java From kite with Apache License 2.0 | 6 votes |
/**
 * An identity partitioner must not be replaceable by a date-format
 * partitioner of the same name: the update check should throw
 * {@link ValidationException}.
 */
@Test
public void testUpdateNonProvided() {
  final PartitionStrategy existing = new PartitionStrategy.Builder()
      .identity("s", "part")
      .build();

  Runnable replaceWithDateFormat = new Runnable() {
    @Override
    public void run() {
      // the replacement is built inside run() so any failure is seen by assertThrows
      Compatibility.checkStrategyUpdate(
          existing,
          new PartitionStrategy.Builder()
              .dateFormat("l", "part", "yyyy-MM-dd")
              .build(),
          PROVIDED_TEST_SCHEMA);
    }
  };

  TestHelpers.assertThrows("Should not allow replacing if not provided",
      ValidationException.class, replaceWithDateFormat);
}
Example #4
Source File: PartitionKey.java From kite with Apache License 2.0 | 6 votes |
/**
 * <p>
 * Builds the partition key for an entity, writing into {@code reuseKey}
 * when one is supplied and allocating a new key otherwise.
 * </p>
 * <p>
 * Each field partitioner of the strategy is applied, in order, to the value
 * the accessor reads from the partitioner's source field; the result is
 * stored at the partitioner's position in the key.
 * </p>
 *
 * @param strategy the strategy whose field partitioners define the key
 * @param entity   the entity to compute a key for
 * @param accessor reads source field values from {@code entity}
 * @param reuseKey a key to overwrite and return, or null to create one
 * @return the populated key ({@code reuseKey} itself when it was non-null)
 */
@SuppressWarnings("unchecked")
public static <E> PartitionKey partitionKeyForEntity(PartitionStrategy strategy,
    E entity, EntityAccessor<E> accessor, @Nullable PartitionKey reuseKey) {
  List<FieldPartitioner> partitioners =
      Accessor.getDefault().getFieldPartitioners(strategy);

  PartitionKey result;
  if (reuseKey != null) {
    result = reuseKey;
  } else {
    result = new PartitionKey(new Object[partitioners.size()]);
  }

  int position = 0;
  for (FieldPartitioner partitioner : partitioners) {
    result.set(position,
        partitioner.apply(accessor.get(entity, partitioner.getSourceName())));
    position += 1;
  }
  return result;
}
Example #5
Source File: TestTableConversion.java From kite with Apache License 2.0 | 6 votes |
@Test public void testConvertTableWithRequiredFields() { Schema recordSchema = Schema.createRecord("inner", null, null, false); recordSchema.setFields(Lists.newArrayList( new Schema.Field("a", Schema.create(Schema.Type.INT), null, null), new Schema.Field("b", optional(Schema.create(Schema.Type.BYTES)), null, NULL_DEFAULT) )); Schema structOfStructsSchema = Schema.createRecord("test", null, null, false); structOfStructsSchema.setFields(Lists.newArrayList( new Schema.Field("str", Schema.create(Schema.Type.STRING), null, null), new Schema.Field("inner", recordSchema, null, null) )); PartitionStrategy strategy = new PartitionStrategy.Builder() .provided("not_present", "int") .hash("inner.a", 16) // requires both inner and inner.a .identity("str") .build(); Assert.assertEquals("Should convert table named test", structOfStructsSchema, HiveSchemaConverter.convertTable("test", TABLE, strategy)); }
Example #6
Source File: TestCrunchDatasets.java From kite with Apache License 2.0 | 6 votes |
/**
 * Copies a single hash partition of the input dataset into the matching
 * partition of the output dataset through a Crunch pipeline and checks the
 * number of records that arrive.
 */
@Test
public void testPartitionedSourceAndTarget() throws IOException {
  PartitionStrategy hashByUsername =
      new PartitionStrategy.Builder().hash("username", 2).build();

  DatasetDescriptor inputDescriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .partitionStrategy(hashByUsername)
      .build();
  Dataset<Record> source = repo.create("ns", "in", inputDescriptor);

  DatasetDescriptor outputDescriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .partitionStrategy(hashByUsername)
      .build();
  Dataset<Record> target = repo.create("ns", "out", outputDescriptor);

  writeTestUsers(source, 10);

  // partition 0 on both sides; only the target partition is requested with 'true'
  PartitionKey partitionZero = new PartitionKey(0);
  PartitionedDataset<Record> partitionedSource = (PartitionedDataset<Record>) source;
  Dataset<Record> sourcePart = partitionedSource.getPartition(partitionZero, false);
  PartitionedDataset<Record> partitionedTarget = (PartitionedDataset<Record>) target;
  Dataset<Record> targetPart = partitionedTarget.getPartition(partitionZero, true);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> records = pipeline.read(
      CrunchDatasets.asSource(sourcePart));
  pipeline.write(records, CrunchDatasets.asTarget(targetPart),
      Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(5, datasetSize(targetPart));
}
Example #7
Source File: TestDatasetWriterCacheLoader.java From kite with Apache License 2.0 | 6 votes |
/**
 * Creates a repository in a fresh temp directory, a {@code users} dataset
 * hash-partitioned on {@code username}, and an unconstrained view over it.
 */
@Before
public void setUp() throws IOException {
  this.conf = new Configuration();
  this.fileSystem = FileSystem.get(conf);
  this.testDirectory = new Path(Files.createTempDir().getAbsolutePath());

  EnusrePartitionPathDoesNotExistMetadataProvider metadataProvider =
      new EnusrePartitionPathDoesNotExistMetadataProvider(conf, testDirectory);
  this.repo = new FileSystemDatasetRepository(conf, testDirectory, metadataProvider);

  partitionStrategy = new PartitionStrategy.Builder()
      .hash("username", 2)
      .build();

  DatasetDescriptor usersDescriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .partitionStrategy(partitionStrategy)
      .build();
  FileSystemDataset<Object> usersDataset =
      (FileSystemDataset<Object>) repo.create("ns", "users", usersDescriptor);

  view = new FileSystemView<Object>(usersDataset, null, null, Object.class);
}
Example #8
Source File: TestCrunchDatasets.java From kite with Apache License 2.0 | 6 votes |
/**
 * Reads a single-user view of a partitioned input dataset and appends it to
 * an unpartitioned Parquet output dataset; exactly one record should land.
 */
@Test
public void testSourceView() throws IOException {
  PartitionStrategy hashByUsername =
      new PartitionStrategy.Builder().hash("username", 2).build();

  DatasetDescriptor inputDescriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .partitionStrategy(hashByUsername)
      .build();
  Dataset<Record> source = repo.create("ns", "in", inputDescriptor);

  DatasetDescriptor outputDescriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .format(Formats.PARQUET)
      .build();
  Dataset<Record> target = repo.create("ns", "out", outputDescriptor);

  writeTestUsers(source, 10);

  // restrict the source to one user before handing it to Crunch
  View<Record> singleUser = source.with("username", "test-0");
  Assert.assertEquals(1, datasetSize(singleUser));

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> records = pipeline.read(
      CrunchDatasets.asSource(singleUser));
  pipeline.write(records, CrunchDatasets.asTarget(target),
      Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(1, datasetSize(target));
}
Example #9
Source File: TestCrunchDatasets.java From kite with Apache License 2.0 | 6 votes |
/**
 * Reads a single-user view of the input dataset and appends it to the
 * matching single-user view of the output dataset; the output dataset
 * should then hold exactly one record.
 */
@Test
public void testTargetView() throws IOException {
  PartitionStrategy hashByUsername =
      new PartitionStrategy.Builder().hash("username", 2).build();

  DatasetDescriptor inputDescriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .partitionStrategy(hashByUsername)
      .build();
  Dataset<Record> source = repo.create("ns", "in", inputDescriptor);

  DatasetDescriptor outputDescriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .partitionStrategy(hashByUsername)
      .build();
  Dataset<Record> target = repo.create("ns", "out", outputDescriptor);

  writeTestUsers(source, 10);

  View<Record> sourceView = source.with("username", "test-0");
  Assert.assertEquals(1, datasetSize(sourceView));

  // the target view carries the same constraint as the source view
  View<Record> targetView = target.with("username", "test-0");

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> records = pipeline.read(
      CrunchDatasets.asSource(sourceView));
  pipeline.write(records, CrunchDatasets.asTarget(targetView),
      Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(1, datasetSize(target));
}
Example #10
Source File: TestCrunchDatasets.java From kite with Apache License 2.0 | 6 votes |
/**
 * Same round trip as the view-to-view copy, but with a provided
 * {@code version} partition: one user is written through the input view and
 * copied into the output view carrying the same version value.
 */
@Test
public void testTargetViewProvidedPartition() throws IOException {
  PartitionStrategy providedVersion =
      new PartitionStrategy.Builder().provided("version").build();

  DatasetDescriptor inputDescriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .partitionStrategy(providedVersion)
      .build();
  Dataset<Record> source = repo.create("ns", "in", inputDescriptor);

  DatasetDescriptor outputDescriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .partitionStrategy(providedVersion)
      .build();
  Dataset<Record> target = repo.create("ns", "out", outputDescriptor);

  // data is written through the view, not through the dataset itself
  View<Record> sourceView = source.with("version", "test-version-0");
  writeTestUsers(sourceView, 1);
  Assert.assertEquals(1, datasetSize(sourceView));

  View<Record> targetView = target.with("version", "test-version-0");

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> records = pipeline.read(
      CrunchDatasets.asSource(sourceView));
  pipeline.write(records, CrunchDatasets.asTarget(targetView),
      Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(1, datasetSize(target));
}
Example #11
Source File: TestCrunchDatasets.java From kite with Apache License 2.0 | 6 votes |
/**
 * Addresses both the source and the target by dataset URI rather than by
 * dataset instance, and checks that all ten written users are copied.
 */
@Test
public void testDatasetUris() throws IOException {
  PartitionStrategy hashByUsername =
      new PartitionStrategy.Builder().hash("username", 2).build();

  DatasetDescriptor inputDescriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .partitionStrategy(hashByUsername)
      .build();
  Dataset<Record> source = repo.create("ns", "in", inputDescriptor);

  DatasetDescriptor outputDescriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .partitionStrategy(hashByUsername)
      .build();
  Dataset<Record> target = repo.create("ns", "out", outputDescriptor);

  writeTestUsers(source, 10);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  // source and target are resolved from URIs built on the repository URI
  PCollection<GenericData.Record> records = pipeline.read(
      CrunchDatasets.asSource(
          new URIBuilder(repo.getUri(), "ns", "in").build(),
          GenericData.Record.class));
  pipeline.write(records,
      CrunchDatasets.asTarget(new URIBuilder(repo.getUri(), "ns", "out").build()),
      Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(10, datasetSize(target));
}
Example #12
Source File: TestPartitionStrategyParser.java From kite with Apache License 2.0 | 6 votes |
@Test public void testIdentity() { // right now, the field type is taken from the Schema checkParser(new PartitionStrategy.Builder() .identity("username", "id") .build(), "[ {\"type\": \"identity\", " + "\"source\": \"username\", " + "\"name\": \"id\"} ]" ); checkParser(new PartitionStrategy.Builder() .identity("username", "username_copy") .build(), "[ {\"type\": \"identity\", \"source\": \"username\"} ]" ); }
Example #13
Source File: FileSystemUtil.java From kite with Apache License 2.0 | 6 votes |
public static PartitionStrategy strategy(FileSystem fs, Path location) throws IOException { if (!fs.exists(location)) { return null; } List<Pair<String, Class<? extends Comparable>>> pairs = visit( new GetPartitionInfo(), fs, location); if (pairs == null || pairs.isEmpty() || pairs.size() <= 1) { return null; } PartitionStrategy.Builder builder = new PartitionStrategy.Builder(); // skip the initial partition because it is the containing directory for (int i = 1; i < pairs.size(); i += 1) { Pair<String, Class<? extends Comparable>> pair = pairs.get(i); builder.provided( pair.first() == null ? "partition_" + i : pair.first(), ProvidedFieldPartitioner.valuesString(pair.second())); } return builder.build(); }
Example #14
Source File: TestPartitionStrategyParser.java From kite with Apache License 2.0 | 6 votes |
/**
 * Embedding a strategy in a schema should be detectable and should round
 * trip: parsing it back from the schema yields an equal strategy.
 */
@Test
public void testAddEmbeddedPartitionStrategy() {
  String userSchemaJson = "{" +
      " \"type\": \"record\"," +
      " \"name\": \"User\"," +
      " \"fields\": [" +
      " {\"name\": \"id\", \"type\": \"long\"}," +
      " {\"name\": \"username\", \"type\": \"string\"}," +
      " {\"name\": \"real_name\", \"type\": \"string\"}" +
      " ]" +
      "}";

  PartitionStrategy expected = new PartitionStrategy.Builder()
      .hash("username", 16)
      .identity("username", "u")
      .build();

  Schema plainSchema = new Schema.Parser().parse(userSchemaJson);
  Schema withStrategy =
      PartitionStrategyParser.embedPartitionStrategy(plainSchema, expected);

  Assert.assertTrue(PartitionStrategyParser.hasEmbeddedStrategy(withStrategy));
  Assert.assertEquals(expected,
      PartitionStrategyParser.parseFromSchema(withStrategy));
}
Example #15
Source File: AvroKeySerDe.java From kite with Apache License 2.0 | 6 votes |
/**
 * Creates a key serde for {@code schema} and precomputes the partial
 * schemas: entry {@code i} is a record made of clones of the first
 * {@code i + 1} fields. The last slot of the array is left unset.
 *
 * @param schema            the full key schema
 * @param partitionStrategy the partition strategy stored with this serde
 */
public AvroKeySerDe(Schema schema, PartitionStrategy partitionStrategy) {
  this.schema = schema;

  int fieldCount = schema.getFields().size();
  partialSchemas = new Schema[fieldCount];

  // prefixes of length 1 .. fieldCount - 1; the full-length prefix is skipped
  for (int prefixLength = 1; prefixLength < fieldCount; prefixLength++) {
    List<Field> prefixFields = new ArrayList<Field>();
    for (Field original : schema.getFields().subList(0, prefixLength)) {
      prefixFields.add(AvroUtils.cloneField(original));
    }
    partialSchemas[prefixLength - 1] = Schema.createRecord(prefixFields);
  }

  this.partitionStrategy = partitionStrategy;
}
Example #16
Source File: EntityAccessor.java From kite with Apache License 2.0 | 5 votes |
/**
 * Fills {@code reuse} with the partition values of {@code object}.
 *
 * The partitioners come from the strategy attached to {@code reuse}; the
 * value at each position is replaced with the result of
 * {@code partitionValue} for the partitioner at that position.
 *
 * @param object   the entity to compute the key for
 * @param provided provided values handed to {@code partitionValue}, may be null
 * @param reuse    the key to overwrite; must not be null
 * @return {@code reuse}, after its values have been replaced
 */
public StorageKey keyFor(E object, @Nullable Map<String, Object> provided,
    StorageKey reuse) {
  Preconditions.checkNotNull(reuse, "Cannot use null key");

  List<FieldPartitioner> fieldPartitioners = Accessor.getDefault()
      .getFieldPartitioners(reuse.getPartitionStrategy());

  int position = 0;
  for (FieldPartitioner fieldPartitioner : fieldPartitioners) {
    Object value = partitionValue(object, provided, fieldPartitioner);
    reuse.replace(position, value);
    position += 1;
  }
  return reuse;
}
Example #17
Source File: TestCompatibilityChecks.java From kite with Apache License 2.0 | 5 votes |
@Test public void testIllegalPartitionNames() { // no need to check sources because '.' and '-' aren't allowed in schemas TestHelpers.assertThrows("Should reject '-' in partition name", ValidationException.class, new Runnable() { @Override public void run() { Compatibility.checkDescriptor( new DatasetDescriptor.Builder() .schema(schema) .partitionStrategy(new PartitionStrategy.Builder() .identity("day_of_month", "day-of-month") .build()) .build()); } }); TestHelpers.assertThrows("Should reject '.' in partition name", ValidationException.class, new Runnable() { @Override public void run() { Compatibility.checkDescriptor( new DatasetDescriptor.Builder() .schema(schema) .partitionStrategy(new PartitionStrategy.Builder() .identity("number", "day.of.month") .build()) .build()); } }); }
Example #18
Source File: TestPartitionStrategyParser.java From kite with Apache License 2.0 | 5 votes |
/**
 * Checks the JSON form of hash partitioners (with and without a name) and
 * that a missing or non-numeric bucket count is rejected by the parser.
 */
@Test
public void testHash() {
  PartitionStrategy unnamed = new PartitionStrategy.Builder().hash("id", 64).build();
  checkParser(unnamed,
      "[ {\"type\": \"hash\", \"source\": \"id\", \"buckets\": 64} ]");

  PartitionStrategy named = new PartitionStrategy.Builder().hash("id", "h", 64).build();
  checkParser(named,
      "[ {\"type\": \"hash\", " +
          "\"source\": \"id\", " +
          "\"name\": \"h\", " +
          "\"buckets\": 64} ]");

  final String missingBuckets = "[ {\"type\": \"hash\", " +
      "\"source\": \"id\", " +
      "\"name\": \"h\"} ]";
  Runnable parseMissingBuckets = new Runnable() {
    @Override
    public void run() {
      PartitionStrategyParser.parse(missingBuckets);
    }
  };
  TestHelpers.assertThrows("Should reject missing buckets",
      ValidationException.class, parseMissingBuckets);

  final String invalidBuckets = "[ {\"type\": \"hash\", " +
      "\"source\": \"id\", " +
      "\"name\": \"h\", " +
      "\"buckets\": \"green\"} ]";
  Runnable parseInvalidBuckets = new Runnable() {
    @Override
    public void run() {
      PartitionStrategyParser.parse(invalidBuckets);
    }
  };
  TestHelpers.assertThrows("Should reject invalid buckets",
      ValidationException.class, parseInvalidBuckets);
}
Example #19
Source File: PartitionStrategyParser.java From kite with Apache License 2.0 | 5 votes |
public static Schema embedPartitionStrategy(Schema schema, PartitionStrategy strategy) { // TODO: avoid embedding strategies in the schema // Avro considers Props read-only and uses an older Jackson version // Parse the Schema as a String because Avro uses com.codehaus.jackson ObjectNode schemaJson = JsonUtil.parse(schema.toString(), ObjectNode.class); schemaJson.set(PARTITIONS, toJson(strategy)); return new Schema.Parser().parse(schemaJson.toString()); }
Example #20
Source File: TestFileSystemUtil.java From kite with Apache License 2.0 | 5 votes |
@Test public void testMultipleParquetFilesInSeparateFolders() throws Exception { File folder = temp.newFolder("a/b/c/d/e"); Path root = new Path(temp.getRoot().toURI()); FileSystem fs = LocalFileSystem.getInstance(); // create a two Avro files under separate folders Path parent = new Path(folder.toURI()); createParquetEventFile(fs, new Path(parent, "part")); createParquetEventFile(fs, new Path(parent, "2")); DatasetDescriptor descriptor = Iterables.getOnlyElement( FileSystemUtil.findPotentialDatasets(fs, root)); PartitionStrategy strategy = new PartitionStrategy.Builder() .provided("partition_1", "string") .build(); Assert.assertFalse("Should not flag at mixed depth", descriptor.hasProperty("kite.filesystem.mixed-depth")); Assert.assertEquals("Should be directly under parent", parent.toUri(), descriptor.getLocation()); Assert.assertEquals("Should use user schema", EVENT_SCHEMA, descriptor.getSchema()); Assert.assertEquals("Should have Parquet format", Formats.PARQUET, descriptor.getFormat()); Assert.assertEquals("Should be partitioned by part=int", strategy, descriptor.getPartitionStrategy()); }
Example #21
Source File: TestPathConversion.java From kite with Apache License 2.0 | 5 votes |
/**
 * A {@code year=/month=/day=} path should convert to a storage key holding
 * the three parsed values in strategy order.
 */
@Test
@SuppressWarnings("unchecked")
public void testToKey() {
  PartitionStrategy byDate = new PartitionStrategy.Builder()
      .year("timestamp")
      .month("timestamp")
      .day("timestamp")
      .build();

  StorageKey expectedKey = new StorageKey(byDate);
  // raw List cast: the values are Integers, the key API is declared on List<Object>
  expectedKey.replaceValues((List) Lists.newArrayList(2013, 11, 5));

  Path partitionPath = new Path("year=2013/month=11/day=5");
  StorageKey reusable = new StorageKey(byDate);

  Assert.assertEquals(expectedKey, convert.toKey(partitionPath, reusable));
}
Example #22
Source File: TestFileSystemDatasetRepository.java From kite with Apache License 2.0 | 5 votes |
@Test public void testUpdateFailsWithPartitionStrategyChange() { PartitionStrategy ps1 = new PartitionStrategy.Builder() .hash("username", 2) .build(); PartitionStrategy ps2 = new PartitionStrategy.Builder() .hash("username", 2) .hash("email", 3) .build(); Dataset<Record> dataset = repo.create(NAMESPACE, NAME, new DatasetDescriptor.Builder(testDescriptor) .partitionStrategy(ps1) .build()); DatasetDescriptor changed = new DatasetDescriptor.Builder(dataset.getDescriptor()) .partitionStrategy(ps2) .build(); try { repo.update(NAMESPACE, NAME, changed); Assert.fail("Should fail due to partition strategy change"); } catch (ValidationException e) { // expected } Assert.assertEquals( ps1, repo.load(NAMESPACE, NAME).getDescriptor().getPartitionStrategy()); }
Example #23
Source File: TestSimpleView.java From kite with Apache License 2.0 | 5 votes |
@Test public void testRefineIdentity() throws Exception { PartitionStrategy strategy = new PartitionStrategy.Builder() .identity("user_id") .build(); DatasetDescriptor descriptor = new DatasetDescriptor.Builder() .schemaUri("resource:standard_event.avsc") .partitionStrategy(strategy) .build(); // Create a separate dataset to avoid conflicts with the above. Dataset<StandardEvent> identityDataset = repo.create( "ns", "test_identity", descriptor); DatasetWriter<StandardEvent> writer = null; try { writer = identityDataset.newWriter(); writer.write(sepEvent); writer.write(octEvent); writer.write(novEvent); } finally { Closeables.close(writer, false); } assertContentEquals(Sets.newHashSet(sepEvent, novEvent), identityDataset.with("user_id", 0L)); }
Example #24
Source File: Constraints.java From kite with Apache License 2.0 | 5 votes |
/**
 * Stores the given schema, strategy, predicate map, and provided-value map
 * as-is; no copies are made and no validation is performed here.
 *
 * @param schema      the entity schema
 * @param strategy    the partition strategy
 * @param constraints predicates keyed by name
 * @param provided    provided values keyed by name
 */
private Constraints(Schema schema, PartitionStrategy strategy,
    Map<String, Predicate> constraints,
    Map<String, Object> provided) {
  this.constraints = constraints;
  this.provided = provided;
  this.strategy = strategy;
  this.schema = schema;
}
Example #25
Source File: AvroKeyEntitySchemaParser.java From kite with Apache License 2.0 | 5 votes |
@Override public AvroKeySchema parseKeySchema(String rawSchema, PartitionStrategy partitionStrategy) { // use DatasetDescriptor.Builder because it checks consistency DatasetDescriptor descriptor = new DatasetDescriptor.Builder() .schemaLiteral(rawSchema) .partitionStrategy(partitionStrategy) .build(); return new AvroKeySchema( descriptor.getSchema(), descriptor.getPartitionStrategy()); }
Example #26
Source File: TestDateFormatPartitioner.java From kite with Apache License 2.0 | 5 votes |
/**
 * A date-format strategy converted to its expression form and back should
 * equal the original strategy.
 */
@Test
public void testExpressionRoundTrip() {
  PartitionStrategy original = new PartitionStrategy.Builder()
      .dateFormat("timestamp", "day", "yyyy-MM-dd")
      .build();

  Accessor accessor = Accessor.getDefault();
  PartitionStrategy roundTripped =
      accessor.fromExpression(accessor.toExpression(original));

  Assert.assertEquals(original, roundTripped);
}
Example #27
Source File: TestMetadataProviders.java From kite with Apache License 2.0 | 5 votes |
@Before public void setUp() throws IOException, URISyntaxException { this.conf = (distributed ? MiniDFSTest.getConfiguration() : new Configuration()); this.testDescriptor = new DatasetDescriptor.Builder() .format(Formats.AVRO) .schema(SchemaBuilder.record("Event").fields() .requiredLong("timestamp") .requiredString("message") .endRecord()) .partitionStrategy(new PartitionStrategy.Builder() .year("timestamp") .month("timestamp") .day("timestamp") .build()) .build(); // something completely different this.anotherDescriptor = new DatasetDescriptor.Builder() .format(Formats.PARQUET) .schema(SchemaBuilder.record("Record").fields() .requiredBytes("some_field") .requiredString("another_field") .endRecord()) .partitionStrategy(new PartitionStrategy.Builder() .hash("some_field", 20000) .build()) .build(); this.provider = newProvider(conf); }
Example #28
Source File: StorageKey.java From kite with Apache License 2.0 | 5 votes |
/**
 * Creates a key for {@code strategy} backed directly by {@code values}.
 *
 * The field map for the strategy is looked up in {@code FIELD_CACHE}; the
 * number of values must match the number of fields in that map. The list is
 * stored as-is (not copied) and the path starts out null.
 *
 * @param strategy the partition strategy this key belongs to
 * @param values   one value per field of the strategy
 * @throws RuntimeException if the field map cannot be obtained from the
 *         cache; the underlying {@link ExecutionException} is the cause
 * @throws IllegalArgumentException if the number of values differs from the
 *         number of fields
 */
private StorageKey(PartitionStrategy strategy, List<Object> values) {
  try {
    this.fields = FIELD_CACHE.get(strategy);
  } catch (ExecutionException ex) {
    // chain the cache failure so the root cause stays in the stack trace
    throw new RuntimeException("[BUG] Could not get field map", ex);
  }
  Preconditions.checkArgument(values.size() == fields.size(),
      "Not enough values for a complete StorageKey");
  this.strategy = strategy;
  this.values = values;
  this.path = null;
}
Example #29
Source File: TestPartitionStrategyParser.java From kite with Apache License 2.0 | 5 votes |
/**
 * Checks the JSON form of minute partitioners, without and with an explicit
 * partition name.
 */
@Test
public void testMinute() {
  PartitionStrategy unnamed = new PartitionStrategy.Builder()
      .minute("time")
      .build();
  checkParser(unnamed, "[ {\"type\": \"minute\", \"source\": \"time\"} ]");

  PartitionStrategy named = new PartitionStrategy.Builder()
      .minute("time", "m")
      .build();
  checkParser(named,
      "[ {\"type\": \"minute\", \"source\": \"time\", \"name\": \"m\"} ]");
}
Example #30
Source File: Constraints.java From kite with Apache License 2.0 | 5 votes |
/**
 * Returns the predicate to apply to whole entities.
 *
 * For record schemas this is an {@code EntityPredicate} over the given
 * predicates; for any other schema type every entity is accepted.
 *
 * @param predicates predicates keyed by name
 * @param schema     the entity schema
 * @param accessor   the accessor handed to the entity predicate
 * @param strategy   the partition strategy handed to the entity predicate
 * @return an entity-level predicate, or an always-true predicate for
 *         non-record schemas
 */
private static <E> Predicate<E> entityPredicate(
    Map<String, Predicate> predicates, Schema schema,
    EntityAccessor<E> accessor, PartitionStrategy strategy) {
  if (schema.getType() == Schema.Type.RECORD) {
    return new EntityPredicate<E>(predicates, schema, accessor, strategy);
  }
  // non-record schemas are never filtered at the entity level
  return alwaysTrue();
}