Example 1
Source File: From kite with Apache License 2.0 | 6 votes |
@Test public void testUnpartitionedDataset() throws Exception { File folder = temp.newFolder("a/b/c/d/e/dataset_name"); Path root = new Path(temp.getRoot().toURI()); FileSystem fs = LocalFileSystem.getInstance(); URI datasetUri = URI.create("dataset:file:" + folder.getAbsolutePath()); DatasetDescriptor descriptor = new DatasetDescriptor.Builder() .schema(USER_SCHEMA) .build(); Dataset<GenericRecord> dataset = Datasets.create(datasetUri, descriptor); // write two so that the descriptor uses the directory rather than a file writeUserToView(dataset); writeUserToView(dataset); DatasetDescriptor expected = dataset.getDescriptor(); DatasetDescriptor actual = Iterables.getOnlyElement( FileSystemUtil.findPotentialDatasets(fs, root)); Assert.assertEquals("Should succeed and find an equivalent descriptor", expected, actual); }
Example 2
Source File: From kite-examples with Apache License 2.0 | 6 votes |
/**
 * Creates the events dataset at {@code uri} and keeps writing randomly
 * generated {@code StandardEvent}s to it until 36000 ms have elapsed since
 * {@code baseTimestamp}; finally prints the value of {@code counter}.
 *
 * @param args command-line arguments (not read by this method)
 * @return always 0
 * @throws Exception if the dataset already exists or creation/writing fails
 */
@Override
public int run(List<String> args) throws Exception {
  boolean alreadyPresent = Datasets.exists(uri);
  Preconditions.checkState(!alreadyPresent, "events dataset already exists");

  DatasetDescriptor eventDescriptor = new DatasetDescriptor.Builder()
      .schema(StandardEvent.class)
      .build();
  View<StandardEvent> eventView =
      Datasets.create(uri, eventDescriptor, StandardEvent.class);

  DatasetWriter<StandardEvent> eventWriter = eventView.newWriter();
  try {
    // keep generating until the time window measured from baseTimestamp closes
    for (;;) {
      long elapsedMillis = System.currentTimeMillis() - baseTimestamp;
      if (elapsedMillis >= 36000) {
        break;
      }
      eventWriter.write(generateRandomEvent());
    }
  } finally {
    eventWriter.close();
  }

  System.out.println("Generated " + counter + " events");
  return 0;
}
Example 3
Source File: From kite with Apache License 2.0 | 6 votes |
/**
 * Recreates an unpartitioned dataset and checks two things about
 * {@code FileSystemDatasets.viewForUri}: the dataset's own location resolves
 * to the dataset itself, and a partition-style relative path is rejected with
 * an {@code IllegalArgumentException}.
 */
@Test
public void testDatasetNotPartitioned() {
  final String datasetUri = "dataset:file:/tmp/datasets/ns/test";

  // start from a clean slate in case an earlier run left the dataset behind
  Datasets.delete(datasetUri);

  DatasetDescriptor unpartitionedDescriptor = new DatasetDescriptor.Builder()
      .schema(schema)
      .build();
  final Dataset<GenericRecord> ds =
      Datasets.create(datasetUri, unpartitionedDescriptor);

  Assert.assertEquals("Should work for empty relative directory",
      ds, FileSystemDatasets.viewForUri(ds, "file:/tmp/datasets/ns/test"));

  Runnable partitionPathLookup = new Runnable() {
    @Override
    public void run() {
      FileSystemDatasets.viewForUri(ds, "y=2014/m=03/d=14");
    }
  };
  TestHelpers.assertThrows("Should reject paths in a non-partitioned dataset",
      IllegalArgumentException.class, partitionPathLookup);
}
Example 4
Source File: From kite-examples with Apache License 2.0 | 6 votes |
/**
 * Correlates events from the input dataset at {@code uri} into an output
 * dataset, creating the output dataset (Avro format, {@code CorrelatedEvents}
 * schema) when it does not exist yet.
 *
 * @param args optional; when exactly one argument is given it replaces the
 *             default output URI {@code dataset:hive?dataset=correlated_events}
 * @return always 0
 * @throws Exception if the input dataset is missing or the task fails
 */
@Override
public int run(List<String> args) throws Exception {
  String inputUri = uri;
  String outputUri = "dataset:hive?dataset=correlated_events";

  if (args.size() == 1) {
    outputUri = args.get(0);
  }

  Preconditions.checkState(Datasets.exists(inputUri),
      "input dataset doesn't exists");

  if (!Datasets.exists(outputUri)) {
    Datasets.create(outputUri, new DatasetDescriptor.Builder()
        .format("avro")
        .schema(CorrelatedEvents.class)
        .build());
  }

  CorrelateEventsTask task = new CorrelateEventsTask(inputUri, outputUri);
  // NOTE(review): the listing built the task but never invoked it (only a
  // stray ";;" was left where the call belongs); run() is restored here --
  // confirm the method name against CorrelateEventsTask.
  task.run();

  return 0;
}
Example 5
Source File: From kite with Apache License 2.0 | 6 votes |
/**
 * Rebuilds the two fixture datasets before each test: an unpartitioned one
 * and one hash-partitioned on {@code id} into 4 buckets, both holding
 * {@code TestRecord}s, then fills each via {@code writeTestRecords}.
 */
@Before
public void createTestDatasets() {
  final String unpartitionedUri = "dataset:file:/tmp/datasets/unpartitioned";
  final String partitionedUri = "dataset:file:/tmp/datasets/partitioned";

  // remove leftovers from earlier runs
  Datasets.delete(unpartitionedUri);
  Datasets.delete(partitionedUri);

  DatasetDescriptor plainDescriptor = new DatasetDescriptor.Builder()
      .schema(TestRecord.class)
      .build();
  unpartitioned =
      Datasets.create(unpartitionedUri, plainDescriptor, TestRecord.class);

  // same descriptor as above, with a hash partition strategy layered on top
  PartitionStrategy hashById = new PartitionStrategy.Builder()
      .hash("id", 4)
      .build();
  DatasetDescriptor hashedDescriptor =
      new DatasetDescriptor.Builder(plainDescriptor)
          .partitionStrategy(hashById)
          .build();
  partitioned =
      Datasets.create(partitionedUri, hashedDescriptor, TestRecord.class);

  writeTestRecords(unpartitioned);
  writeTestRecords(partitioned);
}
Example 6
Source File: From kite with Apache License 2.0 | 5 votes |
@Test public void testCreateChangedAbsolutePathURIMissingNamespace() { // this used to be a relative external URI, but is now a managed URI String uri = "dataset:hive:/ds"; Datasets.create(uri, DESCRIPTOR); Table table = metastore.getTable("default", "ds"); Assert.assertNotNull("Table should be found under default.ds", table); Assert.assertTrue("Should create a managed table: " + table.getSd().getLocation(), HiveAbstractMetadataProvider.isManaged(table)); Assert.assertTrue(Datasets.delete(uri)); }
Example 7
Source File: From kite with Apache License 2.0 | 5 votes |
@Before public void createDatasets() throws Exception { repoUri = "hdfs://" + getDFS().getUri().getAuthority() + "/tmp/data";"delete", unpartitioned, "-r", repoUri, "-d", "target/data"); File csvFile = temp.newFile("users.csv"); csvFile.delete(); String csv = csvFile.toString(); BufferedWriter writer = Files.newWriter( csvFile, CSVSchemaCommand.SCHEMA_CHARSET); writer.append("id,username,email\n"); numRecords = 30; for(int i = 0; i < numRecords; i++) { writer.append(i+",test"+i+",test"+i+"\n"); } writer.close();"-v", "csv-schema", csv, "-o", avsc, "--class", "User");"create", unpartitioned, "-s", avsc, "-r", repoUri, "-d", "target/data"); URI dsUri ="repo:" + repoUri, "default", partitioned); Datasets.<Object, Dataset<Object>>create(dsUri, new DatasetDescriptor.Builder() .partitionStrategy(new PartitionStrategy.Builder() .hash("id", 2) .build()) .schema(SchemaBuilder.record("User").fields() .requiredLong("id") .optionalString("username") .optionalString("email") .endRecord()) .build(), Object.class);"csv-import", csv, unpartitioned, "-r", repoUri, "-d", "target/data");"csv-import", csv, partitioned, "-r", repoUri, "-d", "target/data"); }
Example 8
Source File: From kite-examples with Apache License 2.0 | 5 votes |
@Override public int run(String[] args) throws Exception { // where the schema is stored URI schemaURI = URI.create("resource:simple-log.avsc"); // create a Parquet dataset for long-term storage Datasets.create("dataset:file:/tmp/data/logs", new DatasetDescriptor.Builder() .format(Formats.PARQUET) .schemaUri(schemaURI) .partitionStrategy(new PartitionStrategy.Builder() .year("timestamp", "year") .month("timestamp", "month") .day("timestamp", "day") .build()) .build(), Record.class); // create an Avro dataset to temporarily hold data Datasets.create("dataset:file:/tmp/data/logs_staging", new DatasetDescriptor.Builder() .format(Formats.AVRO) .schemaUri(schemaURI) .partitionStrategy(new PartitionStrategy.Builder() .day("timestamp", "day") .build()) .build(), Record.class); return 0; }
Example 9
Source File: From kite with Apache License 2.0 | 5 votes |
/**
 * Creates a dataset from a {@code dataset:hive:/ds} URI that carries an
 * explicit {@code location} query parameter and checks that it is registered
 * as an external table named {@code ds} in the {@code default} database, then
 * deletes it again.
 */
@Test
public void testCreateChangedAbsolutePathURIMissingNamespaceWithURILocation() {
  final String datasetUri = "dataset:hive:/ds?location=file:/tmp/data/ns/ds";
  Datasets.create(datasetUri, DESCRIPTOR);

  Table created = metastore.getTable("default", "ds");
  Assert.assertNotNull("Table should be found under default.ds", created);

  boolean external = HiveAbstractMetadataProvider.isExternal(created);
  Assert.assertTrue("Should create an external table", external);

  boolean deleted = Datasets.delete(datasetUri);
  Assert.assertTrue(deleted);
}
Example 10
Source File: From kite-examples with Apache License 2.0 | 5 votes |
@Override public int run(String[] args) throws Exception { // Create a dataset of users with the Avro schema DatasetDescriptor descriptor = new DatasetDescriptor.Builder() .schemaUri("resource:user.avsc") .build(); Dataset<Record> users = Datasets.create("dataset:hive?dataset=users", descriptor, Record.class); // Get a writer for the dataset and write some users to it DatasetWriter<Record> writer = null; try { writer = users.newWriter(); Random rand = new Random(); GenericRecordBuilder builder = new GenericRecordBuilder(descriptor.getSchema()); for (int i = 0; i < 100; i++) { Record record = builder.set("username", "user-" + i) .set("creationDate", System.currentTimeMillis()) .set("favoriteColor", colors[rand.nextInt(colors.length)]).build(); writer.write(record); } } finally { if (writer != null) { writer.close(); } } return 0; }
Example 11
Source File: From kite-examples with Apache License 2.0 | 5 votes |
@Override public int run(String[] args) throws Exception { // Create a dataset of products with the Avro schema DatasetDescriptor descriptor = new DatasetDescriptor.Builder() .schema(Product.class) .build(); Dataset<Product> products = Datasets.create( "dataset:hdfs:/tmp/data/products", descriptor, Product.class); // Get a writer for the dataset and write some products to it DatasetWriter<Product> writer = null; try { writer = products.newWriter(); int i = 0; for (String name : names) { Product product = new Product(); product.setName(name); product.setId(i++); writer.write(product); } } finally { if (writer != null) { writer.close(); } } return 0; }
Example 12
Source File: From localization_nifi with Apache License 2.0 | 5 votes |
/**
 * Sends three user records through {@code StoreInKiteDataset} into a Hive
 * dataset and checks that one flow file reaches the {@code success}
 * relationship and that the stored records equal the ones sent.
 *
 * @throws IOException propagated from building the input stream
 */
@Test
public void testBasicStoreToHive() throws IOException {
  String datasetUri = "dataset:hive:ns/test";

  Dataset<Record> dataset = Datasets.create(datasetUri, descriptor, Record.class);

  TestRunner runner = TestRunners.newTestRunner(StoreInKiteDataset.class);
  // the processor is invalid until its dataset URI property is set
  runner.assertNotValid();

  runner.setProperty(StoreInKiteDataset.KITE_DATASET_URI, datasetUri);
  runner.assertValid();

  List<Record> users = Lists.newArrayList(
      user("a", ""),
      user("b", ""),
      user("c", "")
  );

  runner.enqueue(streamFor(users));
  // The listing had only a stray ";;" here; without triggering the processor
  // the enqueued flow file is never transferred and the assertion below fails.
  runner.run();

  runner.assertAllFlowFilesTransferred("success", 1);

  List<Record> stored = Lists.newArrayList(
      (Iterable<Record>) dataset.newReader());
  Assert.assertEquals("Records should match", users, stored);

  Datasets.delete(datasetUri);
}
Example 13
Source File: From kite-examples with Apache License 2.0 | 5 votes |
@Override public int run(String[] args) throws Exception { // Create a dataset of users with the Avro schema DatasetDescriptor descriptor = new DatasetDescriptor.Builder() .schemaUri("resource:user.avsc") .build(); Dataset<Record> users = Datasets.create( "dataset:hdfs:/tmp/data/users", descriptor, Record.class); // Get a writer for the dataset and write some users to it DatasetWriter<Record> writer = null; try { writer = users.newWriter(); Random rand = new Random(); GenericRecordBuilder builder = new GenericRecordBuilder(descriptor.getSchema()); for (int i = 0; i < 100; i++) { Record record = builder.set("username", "user-" + i) .set("creationDate", System.currentTimeMillis()) .set("favoriteColor", colors[rand.nextInt(colors.length)]).build(); writer.write(record); } } finally { if (writer != null) { writer.close(); } } return 0; }
Example 14
Source File: From kite-examples with Apache License 2.0 | 5 votes |
@Override public int run(String[] args) throws Exception { // Create a partition strategy that hash partitions on username with 10 buckets PartitionStrategy partitionStrategy = new PartitionStrategy.Builder() .identity("favoriteColor", "favorite_color") .build(); // Create a dataset of users with the Avro schema DatasetDescriptor descriptor = new DatasetDescriptor.Builder() .schemaUri("resource:user.avsc") .partitionStrategy(partitionStrategy) .build(); Dataset<Record> users = Datasets.create( "dataset:hdfs:/tmp/data/users", descriptor, Record.class); // Get a writer for the dataset and write some users to it DatasetWriter<Record> writer = null; try { writer = users.newWriter(); Random rand = new Random(); GenericRecordBuilder builder = new GenericRecordBuilder(descriptor.getSchema()); for (int i = 0; i < 100; i++) { Record record = builder.set("username", "user-" + i) .set("creationDate", System.currentTimeMillis()) .set("favoriteColor", colors[rand.nextInt(colors.length)]).build(); writer.write(record); } } finally { if (writer != null) { writer.close(); } } return 0; }
Example 15
Source File: From nifi with Apache License 2.0 | 5 votes |
/**
 * Creates a fresh file-backed dataset of {@code Record}s with the test user
 * schema inside a new temporary folder, and stores both its URI and the
 * dataset itself on the test instance.
 *
 * @throws Exception propagated from creating the temporary folder or dataset
 */
@Before
public void createDataset() throws Exception {
  DatasetDescriptor userDescriptor = new DatasetDescriptor.Builder()
      .schema(TestUtil.USER_SCHEMA)
      .build();

  String datasetPath = temp.newFolder("ns", "temp").toString();
  this.datasetUri = "dataset:file:" + datasetPath;

  this.dataset = Datasets.create(datasetUri, userDescriptor, Record.class);
}
Example 16
Source File: From kite with Apache License 2.0 | 5 votes |
@Test public void testAbsolutePathURI() { // recognized as a absolute because there are 3 path components String uri = "dataset:hive:/tmp/data/ns/ds"; Datasets.create(uri, DESCRIPTOR); Table table = metastore.getTable("ns", "ds"); Assert.assertNotNull("Table should be found under ns.ds", table); Assert.assertTrue("Should create an external table", HiveAbstractMetadataProvider.isExternal(table)); Assert.assertTrue(Datasets.delete(uri)); }
Example 17
Source File: From kite with Apache License 2.0 | 5 votes |
/**
 * Creates a dataset from a {@code dataset:hive:/ns/ds} URI that carries an
 * explicit {@code location} query parameter and checks that it is registered
 * as an external table {@code ns.ds}, then deletes it again.
 */
@Test
public void testCreateChangedAbsolutePathURIWithURILocation() {
  final String datasetUri = "dataset:hive:/ns/ds?location=file:/tmp/data/ns/ds";
  Datasets.create(datasetUri, DESCRIPTOR);

  Table created = metastore.getTable("ns", "ds");
  Assert.assertNotNull("Table should be found under ns.ds", created);

  boolean external = HiveAbstractMetadataProvider.isExternal(created);
  Assert.assertTrue("Should create an external table", external);

  boolean deleted = Datasets.delete(datasetUri);
  Assert.assertTrue(deleted);
}
Example 18
Source File: From kite with Apache License 2.0 | 5 votes |
/**
 * Recreates the shared {@code test} dataset once for the whole class, using
 * {@code SCHEMA} and {@code STRATEGY}.
 */
@BeforeClass
public static void createTestDataset() {
  final String testUri = "dataset:file:/tmp/test_name";

  // drop whatever an earlier run may have left at this location
  Datasets.delete(testUri);

  DatasetDescriptor testDescriptor = new DatasetDescriptor.Builder()
      .schema(SCHEMA)
      .partitionStrategy(STRATEGY)
      .build();
  test = Datasets.create(testUri, testDescriptor);
}
Example 19
Source File: From kite with Apache License 2.0 | 5 votes |
@Test public void testRelativePathURI() { // recognized as a deprecated form because there are 3 path components String uri = "dataset:hive:data/ns/ds"; Datasets.create(uri, DESCRIPTOR); Table table = metastore.getTable("ns", "ds"); Assert.assertNotNull("Table should be found under ns.ds", table); Assert.assertTrue("Should create an external table", HiveAbstractMetadataProvider.isExternal(table)); Assert.assertTrue(Datasets.delete(uri)); }
Example 20
Source File: From kite with Apache License 2.0 | 4 votes |
@Test public void testBasics3a() { // only run this test if credentials are present Assume.assumeTrue(ID != null && !ID.isEmpty()); String uri = "dataset:s3a://" + BUCKET + "/ns/test"; // make sure the dataset doesn't already exist Datasets.delete(uri); DatasetDescriptor descriptor = new DatasetDescriptor.Builder() .schemaLiteral("\"string\"") .build(); Dataset<String> dataset = Datasets.create(uri, descriptor, String.class); List<String> expected = Lists.newArrayList("a", "b", "time"); DatasetWriter<String> writer = null; try { writer = dataset.newWriter(); for (String s : expected) { writer.write(s); } } finally { if (writer != null) { writer.close(); } } DatasetReader<String> reader = null; try { reader = dataset.newReader(); Assert.assertEquals("Should match written strings", expected, Lists.newArrayList((Iterator<String>) reader)); } finally { if (reader != null) { reader.close(); } } // clean up Datasets.delete(uri); }