org.kitesdk.data.View Java Examples
The following examples show how to use org.kitesdk.data.View.
Each example notes its originating project and source file.
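Before the individual examples, here is a minimal sketch of the workflow they all share: load a dataset, narrow it to a View with a refinement method such as with(), and read through the view. The dataset URI, field name, and value here are illustrative placeholders, not taken from any project below.

// Load a dataset (a Dataset is itself a View over all of its records).
Dataset<GenericRecord> events = Datasets.load("dataset:hdfs:/data/events");

// Narrow the dataset to the records where "user_id" is 42.
View<GenericRecord> forUser = events.with("user_id", 42L);

// Read the matching records through the view.
DatasetReader<GenericRecord> reader = null;
try {
  reader = forUser.newReader();
  for (GenericRecord event : reader) {
    System.out.println(event);
  }
} finally {
  if (reader != null) {
    reader.close();
  }
}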
Example #1
Source File: CrunchDatasets.java, from kite (Apache License 2.0)
private GetStorageKey(View<E> view, int numPartitionWriters) {
  DatasetDescriptor descriptor = view.getDataset().getDescriptor();
  // get serializable versions of transient objects
  this.strategyString = descriptor.getPartitionStrategy()
      .toString(false /* no white space */);
  this.schemaString = descriptor.getSchema()
      .toString(false /* no white space */);
  this.type = view.getType();
  if (view instanceof AbstractRefinableView) {
    this.constraints = ((AbstractRefinableView) view).getConstraints()
        .toQueryMap();
  } else {
    this.constraints = null;
  }
  this.numPartitionWriters = numPartitionWriters > 0 ? numPartitionWriters : 1;
}
Example #2
Source File: CreateEvents.java, from kite-examples (Apache License 2.0)
@Override
public int run(List<String> args) throws Exception {
  Preconditions.checkState(!Datasets.exists(uri),
      "events dataset already exists");

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(StandardEvent.class).build();
  View<StandardEvent> events = Datasets.create(uri, descriptor,
      StandardEvent.class);

  DatasetWriter<StandardEvent> writer = events.newWriter();
  try {
    while (System.currentTimeMillis() - baseTimestamp < 36000) {
      writer.write(generateRandomEvent());
    }
  } finally {
    writer.close();
  }

  System.out.println("Generated " + counter + " events");
  return 0;
}
Example #3
Source File: TestFileSystemDatasets.java, from kite (Apache License 2.0)
@Test
public void testViewForMissingPartitionNames() {
  Path path = new Path("2014/3/14");

  // like PathConversion, this uses names from the partition strategy
  // and will accept partitions that don't have a "name=" component
  View<GenericRecord> view = FileSystemDatasets.viewForUri(
      dataset, path.toString());
  Assert.assertEquals("Should create correct view",
      view, dataset.getPartitionView(path));

  Constraints expected = ((AbstractRefinableView<GenericRecord>)
      dataset.with("y", 2014).with("m", 3).with("d", 14)).getConstraints();
  Constraints actual = ((AbstractRefinableView<GenericRecord>) view)
      .getConstraints();
  Assert.assertEquals("Constraints should match expected", expected, actual);
}
Example #4
Source File: DatasetTestUtilities.java, from kite (Apache License 2.0)
public static void writeTestUsers(View<GenericData.Record> view, int count,
    int start, String... fields) {
  DatasetWriter<GenericData.Record> writer = null;
  try {
    writer = view.newWriter();
    for (int i = start; i < count + start; i++) {
      GenericRecordBuilder recordBuilder = new GenericRecordBuilder(
          view.getDataset().getDescriptor().getSchema())
          .set("username", "test-" + i);
      for (String field : fields) {
        recordBuilder.set(field, field + "-" + i);
      }
      writer.write(recordBuilder.build());
    }
    if (writer instanceof Flushable) {
      ((Flushable) writer).flush();
    }
  } finally {
    if (writer != null) {
      writer.close();
    }
  }
}
Example #5
Source File: TestCrunchDatasetsHBase.java, from kite (Apache License 2.0)
@Test
public void testSourceView() throws IOException {
  String datasetName = tableName + ".TestGenericEntity";

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaLiteral(testGenericEntity)
      .build();

  Dataset<GenericRecord> inputDataset = repo.create("default", "in", descriptor);
  Dataset<GenericRecord> outputDataset = repo.create("default", datasetName, descriptor);

  writeRecords(inputDataset, 10);

  View<GenericRecord> inputView = inputDataset
      .from("part1", new Utf8("part1_2")).to("part1", new Utf8("part1_7"))
      .from("part2", new Utf8("part2_2")).to("part2", new Utf8("part2_7"));
  Assert.assertEquals(6, datasetSize(inputView));

  Pipeline pipeline = new MRPipeline(TestCrunchDatasetsHBase.class,
      HBaseTestUtils.getConf());
  PCollection<GenericRecord> data = pipeline.read(
      CrunchDatasets.asSource(inputView));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset),
      Target.WriteMode.APPEND);
  pipeline.run();

  checkRecords(outputDataset, 6, 2);
}
Example #6
Source File: TestCrunchDatasets.java, from kite (Apache License 2.0)
@Test
public void testSourceView() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder()
      .hash("username", 2).build();

  Dataset<Record> inputDataset = repo.create("ns", "in",
      new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("ns", "out",
      new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA).format(Formats.PARQUET).build());

  writeTestUsers(inputDataset, 10);

  View<Record> inputView = inputDataset.with("username", "test-0");
  Assert.assertEquals(1, datasetSize(inputView));

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputView));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset),
      Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(1, datasetSize(outputDataset));
}
Example #7
Source File: TestFileSystemDatasets.java, from kite (Apache License 2.0)
@Test
public void testViewForIncompleteUri() {
  Path path = new Path("/tmp/datasets/ns/test/y=2014/m=03");

  View<GenericRecord> view = FileSystemDatasets.viewForUri(
      dataset, path.toString());
  Assert.assertEquals("Should create correct view",
      view, dataset.getPartitionView(path));

  Constraints expected = ((AbstractRefinableView<GenericRecord>)
      dataset.with("y", 2014).with("m", 3)).getConstraints();
  Constraints actual = ((AbstractRefinableView<GenericRecord>) view)
      .getConstraints();
  Assert.assertEquals("Constraints should match expected", expected, actual);
}
Example #8
Source File: TestCrunchDatasets.java, from kite (Apache License 2.0)
@Test
public void testWriteModeOverwrite() throws IOException {
  Dataset<Record> inputDataset = repo.create("ns", "in",
      new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());
  Dataset<Record> outputDataset = repo.create("ns", "out",
      new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());

  writeTestUsers(inputDataset, 1, 0);
  writeTestUsers(outputDataset, 1, 1);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputDataset));
  pipeline.write(data, CrunchDatasets.asTarget((View<Record>) outputDataset),
      Target.WriteMode.OVERWRITE);
  pipeline.run();

  checkTestUsers(outputDataset, 1);
}
Example #9
Source File: TestFileSystemDatasets.java, from kite (Apache License 2.0)
@Test
public void testViewForDifferentPartitionNames() {
  Path path = new Path("year=2014/month=3/day=14");

  // like PathConversion, this uses names from the partition strategy
  // and will accept partitions that have a different "name=" component
  View<GenericRecord> view = FileSystemDatasets.viewForUri(
      dataset, path.toString());
  Assert.assertEquals("Should create correct view",
      view, dataset.getPartitionView(path));

  Constraints expected = ((AbstractRefinableView<GenericRecord>)
      dataset.with("y", 2014).with("m", 3).with("d", 14)).getConstraints();
  Constraints actual = ((AbstractRefinableView<GenericRecord>) view)
      .getConstraints();
  Assert.assertEquals("Constraints should match expected", expected, actual);
}
Example #10
Source File: TestFileSystemDatasets.java, from kite (Apache License 2.0)
@Test
public void testViewForUri() {
  Path path = new Path("/tmp/datasets/ns/test/y=2014/m=03/d=14");

  View<GenericRecord> view = FileSystemDatasets.viewForUri(
      dataset, "file:" + path);
  Assert.assertEquals("Should create correct view",
      view, dataset.getPartitionView(path));

  view = FileSystemDatasets.viewForUri(
      dataset, path.toString());
  Assert.assertEquals("Should create correct view",
      view, dataset.getPartitionView(path));

  Constraints expected = ((AbstractRefinableView<GenericRecord>)
      dataset.with("y", 2014).with("m", 3).with("d", 14)).getConstraints();
  Constraints actual = ((AbstractRefinableView<GenericRecord>) view)
      .getConstraints();
  Assert.assertEquals("Constraints should match expected", expected, actual);
}
Example #11
Source File: DatasetKeyInputFormat.java, from kite (Apache License 2.0)
/**
 * Adds configuration for {@code DatasetKeyInputFormat} to read from the
 * given {@link Dataset} or {@link View} instance.
 *
 * @param view a dataset or view
 * @return this for method chaining
 */
public ConfigBuilder readFrom(View<?> view) {
  DatasetDescriptor descriptor = view.getDataset().getDescriptor();
  // if this is a partitioned dataset, add the partition location
  if (view instanceof FileSystemDataset) {
    conf.set(KITE_PARTITION_DIR, String.valueOf(descriptor.getLocation()));
  }
  // add descriptor properties to the config
  for (String property : descriptor.listProperties()) {
    conf.set(property, descriptor.getProperty(property));
  }

  if (DataModelUtil.isGeneric(view.getType())) {
    Schema datasetSchema = view.getDataset().getDescriptor().getSchema();
    // only set the read schema if the view is a projection
    if (!datasetSchema.equals(view.getSchema())) {
      withSchema(view.getSchema());
    }
  } else {
    withType(view.getType());
  }

  conf.set(KITE_INPUT_URI, view.getUri().toString());
  return this;
}
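For context, here is a sketch of how this builder is typically reached from job setup code; Example #12 below shows the same call in a complete test. The dataset URI and the "month" constraint are illustrative assumptions, not part of the source above.

Job job = new Job();
// Load a dataset and narrow it; readFrom accepts either a dataset or a view.
Dataset<GenericData.Record> events = Datasets.load("dataset:hdfs:/data/events",
    GenericData.Record.class);
View<GenericData.Record> march = events.with("month", 3);
DatasetKeyInputFormat.configure(job).readFrom(march)
    .withType(GenericData.Record.class);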
Example #12
Source File: TestMapReduce.java, from kite (Apache License 2.0)
@Test
@SuppressWarnings("deprecation")
public void testSignalReadyOutputView() throws Exception {
  Assume.assumeTrue(!Hadoop.isHadoop1());
  populateInputDataset();
  populateOutputDataset(); // existing output will be overwritten

  Job job = new Job();
  DatasetKeyInputFormat.configure(job).readFrom(inputDataset)
      .withType(GenericData.Record.class);
  job.setMapperClass(LineCountMapper.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(IntWritable.class);
  job.setReducerClass(GenericStatsReducer.class);

  View<Record> outputView = outputDataset.with("name", "apple", "banana", "carrot");
  DatasetKeyOutputFormat.configure(job).appendTo(outputView)
      .withType(GenericData.Record.class);

  Assert.assertTrue(job.waitForCompletion(true));

  Assert.assertFalse("Output dataset should not be signaled ready",
      ((Signalable) outputDataset).isReady());
  Assert.assertTrue("Output view should be signaled ready",
      ((Signalable) outputView).isReady());
}
Example #13
Source File: TestFileSystemDatasets.java, from kite (Apache License 2.0)
@Test
public void testIgnoresAuthority() {
  Path path = new Path("/tmp/datasets/ns/test/y=2014/m=03/d=14");

  View<GenericRecord> view = FileSystemDatasets.viewForUri(
      dataset, "file://127.0.0.1/tmp/datasets/ns/test/y=2014/m=03/d=14");
  Assert.assertEquals("Should create correct view",
      view, dataset.getPartitionView(path));

  Constraints expected = ((AbstractRefinableView<GenericRecord>)
      dataset.with("y", 2014).with("m", 3).with("d", 14)).getConstraints();
  Constraints actual = ((AbstractRefinableView<GenericRecord>) view)
      .getConstraints();
  Assert.assertEquals("Constraints should match expected", expected, actual);
}
Example #14
Source File: TestViewUris.java, from kite (Apache License 2.0)
@Test
public void testURIStringEquality() {
  for (int i = 0; i < 10; i++) {
    String a = UUID.randomUUID().toString();
    String b = UUID.randomUUID().toString();
    String originalUri = "view:file:/tmp/test_name?color=" + a + "," + b;

    View<GenericRecord> view = Datasets.load(originalUri);
    String afterUri = view.getUri().toString();

    if (!originalUri.equals(afterUri)) {
      System.out.println("Iteration: " + i);
      System.out.println("Start: " + originalUri);
      System.out.println("End  : " + afterUri);
    }

    Assert.assertEquals(originalUri, afterUri);
  }
}
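The round-trip above relies on the view URI syntax, which layers constraints onto a dataset URI as query parameters. A minimal sketch of the pattern, using the same illustrative path as the test:

// view:<storage>:<path>?<field>=<comma-separated values>
View<GenericRecord> colored = Datasets.load(
    "view:file:/tmp/test_name?color=red,green");

Because the constraints are encoded in the string, view.getUri() can reproduce the original URI, which is exactly what the test asserts.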
Example #15
Source File: FileSystemDataset.java, from kite (Apache License 2.0)
@Override
public boolean canReplace(View<E> part) {
  if (part instanceof FileSystemView) {
    return equals(part.getDataset()) &&
        ((FileSystemView) part).getConstraints().alignedWithBoundaries();
  } else if (part instanceof FileSystemDataset) {
    return equals(part);
  }
  return false;
}
Example #16
Source File: FileSystemDataset.java, from kite (Apache License 2.0)
View<E> viewForUri(URI location) {
  Preconditions.checkNotNull(location, "Partition location cannot be null");
  PartitionView<E> view = getPartitionView(location);
  if (view == unbounded) {
    return this;
  }
  return view;
}
Example #17
Source File: TestProjection.java, from kite (Apache License 2.0)
@Test
public void testSpecificProjectionAsType() throws IOException {
  Dataset<GenericRecord> original = Datasets.load(unbounded.getUri());

  DatasetWriter<StandardEvent> writer = null;
  try {
    writer = original.asType(StandardEvent.class).newWriter();
    writer.write(sepEvent);
    writer.write(octEvent);
    writer.write(novEvent);
  } finally {
    Closeables.close(writer, false);
  }

  final View<SmallEvent> smallEvents = original.asType(SmallEvent.class);

  Set<SmallEvent> expected = Sets.newHashSet(toSmallEvent(sepEvent),
      toSmallEvent(octEvent), toSmallEvent(novEvent));

  assertContentEquals(expected, smallEvents);

  TestHelpers.assertThrows("Should not be able to write small events",
      IncompatibleSchemaException.class, new Runnable() {
        @Override
        public void run() {
          smallEvents.newWriter();
        }
      });
}
Example #18
Source File: DaoViewTest.java, from kite (Apache License 2.0)
private void validRange(View<TestEntity> range, int startIdx, int endIdx) {
  int cnt = startIdx;
  DatasetReader<TestEntity> reader = range.newReader();
  try {
    for (TestEntity entity : reader) {
      Assert.assertEquals(Integer.toString(cnt), entity.getPart1());
      Assert.assertEquals(Integer.toString(cnt), entity.getPart2());
      cnt++;
    }
  } finally {
    reader.close();
  }
  Assert.assertEquals(endIdx, cnt);
}
Example #19
Source File: DatasetKeyOutputFormat.java, from kite (Apache License 2.0)
@SuppressWarnings("deprecation") private static <E> View<E> load(JobContext jobContext) { Configuration conf = Hadoop.JobContext.getConfiguration.invoke(jobContext); Class<E> type = getType(jobContext); String outputUri = conf.get(KITE_OUTPUT_URI); return Datasets.<E, View<E>>load(outputUri, type); }
Example #20
Source File: DatasetKeyOutputFormat.java, from kite (Apache License 2.0)
@Override
public OutputCommitter getOutputCommitter(TaskAttemptContext taskAttemptContext) {
  Configuration conf = Hadoop.TaskAttemptContext
      .getConfiguration.invoke(taskAttemptContext);
  DefaultConfiguration.init(conf);
  View<E> view = load(taskAttemptContext);
  return usePerTaskAttemptDatasets(view, conf) ?
      new MergeOutputCommitter<E>() : new NullOutputCommitter();
}
Example #21
Source File: DatasetKeyOutputFormat.java, from kite (Apache License 2.0)
@Override
public void checkOutputSpecs(JobContext jobContext) {
  // The committer setup will fail if the output dataset does not exist
  View<E> target = load(jobContext);
  Configuration conf = Hadoop.JobContext.getConfiguration.invoke(jobContext);
  switch (conf.getEnum(KITE_WRITE_MODE, WriteMode.DEFAULT)) {
    case APPEND:
      break;
    case OVERWRITE:
      // if the merge won't use replace, then delete the existing data
      if (!canReplace(target)) {
        target.deleteAll();
      }
      break;
    default:
    case DEFAULT:
      boolean isReady = false;
      if (target instanceof Signalable) {
        isReady = ((Signalable) target).isReady();
      }
      if (isReady || !target.isEmpty()) {
        throw new DatasetException(
            "View is not empty or has been signaled as ready: " + target);
      }
      break;
  }
}
Example #22
Source File: DatasetKeyOutputFormat.java, from kite (Apache License 2.0)
@Override
@SuppressWarnings("unchecked")
public RecordWriter<E, Void> getRecordWriter(TaskAttemptContext taskAttemptContext) {
  Configuration conf = Hadoop.TaskAttemptContext
      .getConfiguration.invoke(taskAttemptContext);
  View<E> target = load(taskAttemptContext);
  View<E> working;

  if (usePerTaskAttemptDatasets(target, conf)) {
    working = loadOrCreateTaskAttemptView(taskAttemptContext);
  } else {
    working = target;
  }

  boolean copyRecords = conf.getBoolean(KITE_COPY_RECORDS, false);

  String partitionDir = conf.get(KITE_PARTITION_DIR);
  if (working.getDataset().getDescriptor().isPartitioned() &&
      partitionDir != null) {
    if (!(target instanceof FileSystemDataset)) {
      throw new UnsupportedOperationException("Partitions only supported for " +
          "FileSystemDataset. Dataset: " + target);
    }
    FileSystemDataset fsDataset = (FileSystemDataset) target;
    PartitionKey key = fsDataset.keyFromDirectory(new Path(partitionDir));
    if (key != null && !key.getValues().isEmpty()) {
      working = fsDataset.getPartition(key, true);
    }
    return new DatasetRecordWriter<E>(working, copyRecords);
  } else {
    return new DatasetRecordWriter<E>(working, copyRecords);
  }
}
Example #23
Source File: DatasetKeyOutputFormat.java, from kite (Apache License 2.0)
@Override
@SuppressWarnings("unchecked")
public void commitJob(JobContext jobContext) throws IOException {
  Configuration conf = Hadoop.JobContext.getConfiguration.invoke(jobContext);
  DatasetRepository repo = getDatasetRepository(jobContext);
  boolean isTemp = repo instanceof TemporaryDatasetRepository;

  String jobDatasetName = getJobDatasetName(jobContext);
  View<E> targetView = load(jobContext);
  Dataset<E> jobDataset = repo.load(TEMP_NAMESPACE, jobDatasetName);

  WriteMode mode = conf.getEnum(KITE_WRITE_MODE, WriteMode.DEFAULT);
  if (mode == WriteMode.OVERWRITE && canReplace(targetView)) {
    ((Replaceable<View<E>>) targetView.getDataset()).replace(targetView, jobDataset);
  } else {
    ((Mergeable<Dataset<E>>) targetView.getDataset()).merge(jobDataset);
  }

  if (targetView instanceof Signalable) {
    ((Signalable) targetView).signalReady();
  }

  if (isTemp) {
    ((TemporaryDatasetRepository) repo).delete();
  } else {
    repo.delete(TEMP_NAMESPACE, jobDatasetName);
  }
}
Example #24
Source File: DatasetSink.java, from kite (Apache License 2.0)
private DatasetWriter<GenericRecord> newWriter(
    final UserGroupInformation login, final URI uri) {
  View<GenericRecord> view = KerberosUtil.runPrivileged(login,
      new PrivilegedExceptionAction<Dataset<GenericRecord>>() {
        @Override
        public Dataset<GenericRecord> run() {
          return Datasets.load(uri);
        }
      });

  DatasetDescriptor descriptor = view.getDataset().getDescriptor();
  String formatName = descriptor.getFormat().getName();
  Preconditions.checkArgument(allowedFormats().contains(formatName),
      "Unsupported format: " + formatName);

  Schema newSchema = descriptor.getSchema();
  if (targetSchema == null || !newSchema.equals(targetSchema)) {
    this.targetSchema = descriptor.getSchema();
    // target dataset schema has changed, invalidate all readers based on it
    readers.invalidateAll();
  }

  this.reuseDatum = !("parquet".equals(formatName));
  this.datasetName = view.getDataset().getName();

  return view.newWriter();
}
Example #25
Source File: TestProjection.java, from kite (Apache License 2.0)
@Test
public void testReflectProjectionLoad() throws IOException {
  Dataset<ReflectStandardEvent> original = repo.create(
      "ns", "reflectProjection",
      new DatasetDescriptor.Builder()
          .schema(ReflectStandardEvent.class)
          .build(),
      ReflectStandardEvent.class);

  DatasetWriter<ReflectStandardEvent> writer = null;
  try {
    writer = original.newWriter();
    writer.write(new ReflectStandardEvent(sepEvent));
    writer.write(new ReflectStandardEvent(octEvent));
    writer.write(new ReflectStandardEvent(novEvent));
  } finally {
    Closeables.close(writer, false);
  }

  View<ReflectSmallEvent> dataset = repo.load("ns", original.getName(),
      ReflectSmallEvent.class);

  Set<ReflectSmallEvent> expected = Sets.newHashSet(
      new ReflectSmallEvent(sepEvent), new ReflectSmallEvent(octEvent),
      new ReflectSmallEvent(novEvent));

  assertContentEquals(expected, dataset);
}
Example #26
Source File: DatasetKeyOutputFormat.java, from kite (Apache License 2.0)
/**
 * Adds configuration for {@code DatasetKeyOutputFormat} to write to the
 * given {@link Dataset} or {@link View} instance.
 *
 * @param view a dataset or view
 * @return this for method chaining
 */
public ConfigBuilder writeTo(View<?> view) {
  if (view instanceof FileSystemDataset) {
    FileSystemDataset dataset = (FileSystemDataset) view;
    conf.set(KITE_PARTITION_DIR,
        String.valueOf(dataset.getDescriptor().getLocation()));
  }
  withType(view.getType());
  return writeTo(view.getUri());
}
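This is the output-side counterpart of the readFrom builder in Example #11, and is normally reached through DatasetKeyOutputFormat.configure(job); Example #12 uses the related appendTo method the same way. A sketch under those assumptions (the job setup, URI, and constraint are illustrative):

Job job = new Job();
Dataset<GenericData.Record> out = Datasets.load("dataset:hdfs:/data/out",
    GenericData.Record.class);
// Writing through a view scopes the job's output to that view's constraints.
View<GenericData.Record> target = out.with("month", 3);
DatasetKeyOutputFormat.configure(job).appendTo(target)
    .withType(GenericData.Record.class);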
Example #27
Source File: TestCrunchDatasets.java, from kite (Apache License 2.0)
private void runCheckpointPipeline(View<Record> inputView,
    View<Record> outputView) {
  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputView));
  pipeline.write(data, CrunchDatasets.asTarget(outputView),
      Target.WriteMode.CHECKPOINT);
  pipeline.done();
}
Example #28
Source File: CompactionTask.java, from kite (Apache License 2.0)
@SuppressWarnings("unchecked") private void checkCompactable(View<T> view) { Dataset<T> dataset = view.getDataset(); if (!(dataset instanceof Replaceable)) { throw new IllegalArgumentException("Cannot compact dataset: " + dataset); } Replaceable<View<T>> replaceable = ((Replaceable<View<T>>) dataset); Preconditions.checkArgument(replaceable.canReplace(view), "Cannot compact view: " + view); }
Example #29
Source File: TestCrunchDatasets.java, from kite (Apache License 2.0)
@Test(expected = CrunchRuntimeException.class)
public void testWriteModeDefaultFailsWithExisting() throws IOException {
  Dataset<Record> inputDataset = repo.create("ns", "in",
      new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());
  Dataset<Record> outputDataset = repo.create("ns", "out",
      new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());

  writeTestUsers(inputDataset, 1, 0);
  writeTestUsers(outputDataset, 1, 0);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputDataset));
  pipeline.write(data, CrunchDatasets.asTarget((View<Record>) outputDataset));
}
Example #30
Source File: DatasetKeyOutputFormat.java, from kite (Apache License 2.0)
@SuppressWarnings("unchecked") private static boolean canReplace(View<?> view) { if (Hadoop.isHadoop1()) { // can't use replace because it is called in the OutputCommitter. return false; } Dataset<?> dataset = view.getDataset(); return (dataset instanceof Replaceable && ((Replaceable<View<?>>) dataset).canReplace(view)); }