org.kitesdk.data.DatasetDescriptor Java Examples
The following examples show how to use
org.kitesdk.data.DatasetDescriptor.
Each example is drawn from an open source project; its source file and license are noted above the code.
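Most of the examples below follow the same basic pattern: build an immutable DatasetDescriptor with DatasetDescriptor.Builder, then hand it to a repository or to Datasets.create. The following is a minimal sketch of that pattern; the Event schema, partition strategy, and dataset URI are illustrative placeholders, not values taken from any example.

import org.apache.avro.SchemaBuilder;
import org.apache.avro.generic.GenericRecord;
import org.kitesdk.data.Dataset;
import org.kitesdk.data.DatasetDescriptor;
import org.kitesdk.data.Datasets;
import org.kitesdk.data.Formats;
import org.kitesdk.data.PartitionStrategy;

public class DescriptorSketch {
  public static void main(String[] args) {
    // build an immutable descriptor: schema, partitioning, and storage format
    DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
        .schema(SchemaBuilder.record("Event").fields()
            .requiredLong("timestamp")
            .requiredString("message")
            .endRecord())
        .partitionStrategy(new PartitionStrategy.Builder()
            .year("timestamp")
            .build())
        .format(Formats.AVRO)
        .build();

    // create a dataset from the descriptor; the URI is a placeholder
    Dataset<GenericRecord> events = Datasets.create(
        "dataset:file:/tmp/data/events", descriptor, GenericRecord.class);
    System.out.println("Created dataset at " + events.getDescriptor().getLocation());
  }
}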
Example #1
Source File: TestFileSystemDatasetRepository.java From kite with Apache License 2.0
@Test
public void testUpdateFailsWithFormatChange() {
  Dataset<Record> dataset = repo.create(NAMESPACE, NAME,
      new DatasetDescriptor.Builder(testDescriptor)
          .format(Formats.AVRO)
          .build());
  DatasetDescriptor changed =
      new DatasetDescriptor.Builder(dataset.getDescriptor())
          .format(Formats.PARQUET)
          .build();
  try {
    repo.update(NAMESPACE, NAME, changed);
    Assert.fail("Should fail due to format change");
  } catch (ValidationException e) {
    // expected
  }
  Assert.assertEquals(
      Formats.AVRO,
      repo.load(NAMESPACE, NAME).getDescriptor().getFormat());
}
Example #2
Source File: TestWriteReflectReadGeneric.java From kite with Apache License 2.0
@BeforeClass
public static void setup() throws IOException {
  fs = LocalFileSystem.getInstance();
  testDirectory = new Path(Files.createTempDir().getAbsolutePath());
  FileSystemDatasetRepository repo =
      new FileSystemDatasetRepository(fs.getConf(), testDirectory);
  Dataset<MyRecord> writerDataset = repo.create("ns", "test",
      new DatasetDescriptor.Builder()
          .schema(MyRecord.class)
          .build(),
      MyRecord.class);
  DatasetWriter<MyRecord> writer = writerDataset.newWriter();
  for (int i = 0; i < totalRecords; i++) {
    writer.write(new MyRecord(String.valueOf(i), i));
  }
  writer.close();

  readerDataset = repo.load("ns", "test", GenericRecord.class);
}
Example #3
Source File: HBaseMetadataProvider.java From kite with Apache License 2.0
@Override
public DatasetDescriptor load(String namespace, String name) {
  Preconditions.checkArgument(DEFAULT_NAMESPACE.equals(namespace),
      "Non-default namespaces are not supported");
  Preconditions.checkNotNull(name, "Dataset name cannot be null");
  if (!exists(namespace, name)) {
    throw new DatasetNotFoundException("No such dataset: " + name);
  }
  String tableName = getTableName(name);
  String entityName = getEntityName(name);
  return new DatasetDescriptor.Builder()
      .schemaLiteral(schemaManager.getEntitySchema(tableName, entityName)
          .getRawSchema())
      .build();
}
Example #4
Source File: TestHiveRepositoryURIs.java From kite with Apache License 2.0
@Test
public void testExternalURI() {
  URI hdfsUri = getDFS().getUri();
  URI repoUri = URI.create("repo:hive:/tmp/hive-repo?hdfs:host=" +
      hdfsUri.getHost() + "&hdfs:port=" + hdfsUri.getPort());
  DatasetRepository repo = DatasetRepositories.repositoryFor(repoUri);

  Assert.assertNotNull("Received a repository", repo);
  org.junit.Assert.assertTrue("Repo should be a HCatalogExternalDatasetRepository",
      repo instanceof HiveExternalDatasetRepository);
  Assert.assertEquals("Repository URI", repoUri, repo.getUri());

  // verify location
  DatasetDescriptor created = repo.create("tmp", "test",
      new DatasetDescriptor.Builder()
          .schemaLiteral("\"string\"")
          .build())
      .getDescriptor();
  Assert.assertEquals("Location should be in HDFS",
      "hdfs", created.getLocation().getScheme());
  Assert.assertEquals("Location should have the correct HDFS host",
      hdfsUri.getHost(), created.getLocation().getHost());
  Assert.assertEquals("Location should have the correct HDFS port",
      hdfsUri.getPort(), created.getLocation().getPort());
  Assert.assertTrue("Location should be in the repo path",
      created.getLocation().getPath().startsWith("/tmp/hive-repo"));
}
Example #5
Source File: FileSystemDatasetRepository.java From kite with Apache License 2.0
@Override
public <E> Dataset<E> load(String namespace, String name, Class<E> type) {
  Preconditions.checkNotNull(namespace, "Namespace cannot be null");
  Preconditions.checkNotNull(name, "Dataset name cannot be null");

  LOG.debug("Loading dataset: {}", name);

  DatasetDescriptor descriptor = metadataProvider.load(namespace, name);

  FileSystemDataset<E> ds = new FileSystemDataset.Builder<E>()
      .namespace(namespace)
      .name(name)
      .configuration(conf)
      .descriptor(descriptor)
      .type(type)
      .uri(new URIBuilder(getUri(), namespace, name).build())
      .partitionKey(descriptor.isPartitioned() ? new PartitionKey() : null)
      .partitionListener(getPartitionListener())
      .build();

  LOG.debug("Loaded dataset:{}", ds);

  return ds;
}
Example #6
Source File: TestMetadataProviders.java From kite with Apache License 2.0
@Test
public void testCustomProperties() {
  final String propName = "my.custom.property";
  final String propValue = "string";
  DatasetDescriptor descriptorWithProp =
      new DatasetDescriptor.Builder(testDescriptor)
          .property(propName, propValue)
          .build();

  DatasetDescriptor created = provider.create(NAMESPACE, NAME, descriptorWithProp);
  Assert.assertTrue("Should have custom property",
      created.hasProperty(propName));
  Assert.assertEquals("Should have correct custom property value",
      propValue, created.getProperty(propName));
  Assert.assertTrue("List should contain property name",
      created.listProperties().contains(propName));

  DatasetDescriptor loaded = provider.load(NAMESPACE, NAME);
  Assert.assertTrue("Should have custom property",
      loaded.hasProperty(propName));
  Assert.assertEquals("Should have correct custom property value",
      propValue, loaded.getProperty(propName));
  Assert.assertTrue("List should contain property name",
      loaded.listProperties().contains(propName));
}
Example #7
Source File: TestHiveDatasetURIsCompatibility.java From kite with Apache License 2.0
@Test
public void testLoadChangedAbsolutePathURICompatibility() {
  // this used to be a relative external URI, but is now a managed URI
  String uri = "dataset:hive:/data/ds";

  DatasetRepository repo = DatasetRepositories
      .repositoryFor("repo:hive:/tmp/data");
  DatasetDescriptor withLocation = new DatasetDescriptor.Builder(DESCRIPTOR)
      .location("file:/tmp/data/ds") // old location
      .build();
  Dataset<GenericRecord> expected = repo.create(
      "default", "ds", withLocation, GenericRecord.class);

  Dataset<GenericRecord> actual = Datasets.load(uri);

  Assert.assertEquals("Should load existing dataset default.ds",
      expected, actual);
  Assert.assertEquals("URI should use apparent namespace",
      "dataset:hive:data/ds", actual.getUri().toString());

  Assert.assertTrue(Datasets.delete(uri));
}
Example #8
Source File: TestHiveExternalDatasetRepository.java From kite with Apache License 2.0
@SuppressWarnings("deprecation") @Test public void testNewPartitionIsVisibleToHive() throws Exception { final String NAME2 = "test2"; PartitionStrategy partitionStrategy = new PartitionStrategy.Builder() .hash("username", 2).build(); DatasetDescriptor descriptor = new DatasetDescriptor.Builder() .schema(testSchema) .partitionStrategy(partitionStrategy) .build(); Dataset<GenericRecord> dataset = repo.create(NAMESPACE, NAME2, descriptor); HiveTestUtils.assertTableExists(client, NAMESPACE, NAME2); HiveTestUtils.assertTableIsExternal(client, NAMESPACE, NAME2); Assert.assertTrue("No partitions yet", client.listPartitionNames(NAMESPACE, NAME2, (short) 10).isEmpty()); writeRecord(dataset, 0); Assert.assertEquals("Should be one partition", 1, client.listPartitionNames(NAMESPACE, NAME2, (short) 10).size()); }
Example #9
Source File: TestFileSystemDatasetRepository.java From kite with Apache License 2.0
@Test
public void testUpdateFailsWithLocationChange() {
  ensureCreated();
  Dataset<Record> dataset = repo.load(NAMESPACE, NAME);
  URI location = dataset.getDescriptor().getLocation();

  DatasetDescriptor changed =
      new DatasetDescriptor.Builder(dataset.getDescriptor())
          .location(new Path(testDirectory, "newDataLocation").toUri())
          .build();

  try {
    repo.update(NAMESPACE, NAME, changed);
    Assert.fail("Should fail due to data location change");
  } catch (ValidationException ex) {
    // expected
  }

  Assert.assertEquals(
      location,
      repo.load(NAMESPACE, NAME).getDescriptor().getLocation());
}
Example #10
Source File: TestUpdateDatasetCommand.java From kite with Apache License 2.0
@Test
public void testUpdateSchema() throws Exception {
  File avroSchemaFile = new File("target/schema_update.avsc");
  new FileWriter(avroSchemaFile).append(schema2).close();

  command.datasets = Lists.newArrayList("users");
  command.avroSchemaFile = avroSchemaFile.toString();
  command.run();

  DatasetDescriptor updated = new DatasetDescriptor.Builder(original)
      .schemaLiteral(schema2)
      .build();

  verify(repo).load("default", "users"); // need to load the current dataset
  verify(ds).getDescriptor(); // should inspect and use its descriptor
  verify(repo).update(eq("default"), eq("users"), argThat(TestUtil.matches(updated)));
  verify(console).debug(contains("Updated"), eq("users"));
}
Example #11
Source File: TestCreateDatasetCommandCluster.java From kite with Apache License 2.0
@Test
public void testBasicUseLocalSchema() throws Exception {
  String avsc = "target/localUser.avsc";
  FSDataOutputStream out = getFS()
      .create(new Path(avsc), true /* overwrite */);
  ByteStreams.copy(Resources.getResource("test-schemas/user.avsc").openStream(), out);
  out.close();

  command.avroSchemaFile = avsc;
  command.datasets = Lists.newArrayList("users");
  command.run();

  DatasetDescriptor expectedDescriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:test-schemas/user.avsc")
      .build();

  verify(getMockRepo()).create("default", "users", expectedDescriptor);
  verify(console).debug(contains("Created"), eq("users"));
}
Example #12
Source File: TestCrunchDatasetsHBase.java From kite with Apache License 2.0
@Test
public void testGeneric() throws IOException {
  String datasetName = tableName + ".TestGenericEntity";

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaLiteral(testGenericEntity)
      .build();

  Dataset<GenericRecord> inputDataset = repo.create("default", "in", descriptor);
  Dataset<GenericRecord> outputDataset = repo.create("default", datasetName, descriptor);

  writeRecords(inputDataset, 10);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasetsHBase.class,
      HBaseTestUtils.getConf());
  PCollection<GenericRecord> data = pipeline.read(
      CrunchDatasets.asSource(inputDataset));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  checkRecords(outputDataset, 10, 0);
}
Example #13
Source File: TestCrunchDatasets.java From kite with Apache License 2.0
@Test
public void testGeneric() throws IOException {
  Dataset<Record> inputDataset = repo.create("ns", "in",
      new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA).build());
  Dataset<Record> outputDataset = repo.create("ns", "out",
      new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA).build());

  // write two files, each of 5 records
  writeTestUsers(inputDataset, 5, 0);
  writeTestUsers(inputDataset, 5, 5);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputDataset));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  checkTestUsers(outputDataset, 10);
}
Example #14
Source File: TestMetadataProviders.java From kite with Apache License 2.0
@Test
public void testCreateWithLocation() throws URISyntaxException {
  Assert.assertFalse("Sanity check", provider.exists(NAMESPACE, NAME));

  String auth = getDFS().getUri().getAuthority();
  URI requestedLocation = new URI("hdfs://" + auth + "/tmp/data/my_data_set");
  DatasetDescriptor requested = new DatasetDescriptor.Builder(testDescriptor)
      .location(requestedLocation)
      .build();

  final DatasetDescriptor created;
  try {
    created = provider.create(NAMESPACE, NAME, requested);
  } catch (UnsupportedOperationException ex) {
    // this is expected if the provider doesn't support requested locations
    return;
  }

  // if supported, the location should be unchanged.
  Assert.assertNotNull("Descriptor should be returned", created);
  Assert.assertTrue("Descriptor should exist", provider.exists(NAMESPACE, NAME));
  Assert.assertEquals("Requested locations should match",
      requestedLocation, created.getLocation());
}
Example #15
Source File: HBaseDatasetReaderTest.java From kite with Apache License 2.0
@BeforeClass
public static void beforeClass() throws Exception {
  HBaseTestUtils.getMiniCluster();
  // managed table should be created by HBaseDatasetRepository
  HBaseTestUtils.util.deleteTable(Bytes.toBytes(managedTableName));

  HBaseDatasetRepository repo = new HBaseDatasetRepository.Builder()
      .configuration(HBaseTestUtils.getConf()).build();
  String testGenericEntity = AvroUtils.inputStreamToString(
      HBaseDatasetRepositoryTest.class.getResourceAsStream("/TestGenericEntity.avsc"));
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaLiteral(testGenericEntity)
      .build();
  dataset = repo.create("default", "testtable", descriptor);

  for (int i = 0; i < 10; i++) {
    dataset.put(HBaseDatasetRepositoryTest.createGenericEntity(i));
  }
}
Example #16
Source File: PartitionedDatasetWriter.java From kite with Apache License 2.0
static <E> PartitionedDatasetWriter<E, ?> newWriter(FileSystemView<E> view) {
  DatasetDescriptor descriptor = view.getDataset().getDescriptor();
  Format format = descriptor.getFormat();
  if (Formats.PARQUET.equals(format)) {
    // by default, Parquet is not durable
    if (DescriptorUtil.isDisabled(
        FileSystemProperties.NON_DURABLE_PARQUET_PROP, descriptor)) {
      return new IncrementalPartitionedDatasetWriter<E>(view);
    } else {
      return new NonDurablePartitionedDatasetWriter<E>(view);
    }
  } else if (Formats.AVRO.equals(format) || Formats.CSV.equals(format)) {
    return new IncrementalPartitionedDatasetWriter<E>(view);
  } else {
    return new NonDurablePartitionedDatasetWriter<E>(view);
  }
}
Example #17
Source File: FileSystemWriter.java From kite with Apache License 2.0
private FileSystemWriter(FileSystem fs, Path path, long rollIntervalMillis,
                         long targetFileSize, DatasetDescriptor descriptor,
                         Schema writerSchema) {
  Preconditions.checkNotNull(fs, "File system is not defined");
  Preconditions.checkNotNull(path, "Destination directory is not defined");
  Preconditions.checkNotNull(descriptor, "Descriptor is not defined");

  this.fs = fs;
  this.directory = path;
  this.rollIntervalMillis = rollIntervalMillis;
  this.targetFileSize = targetFileSize;
  this.descriptor = descriptor;
  this.conf = new Configuration(fs.getConf());
  this.state = ReaderWriterState.NEW;
  this.schema = writerSchema;

  // copy file format settings from custom properties to the Configuration
  for (String prop : descriptor.listProperties()) {
    conf.set(prop, descriptor.getProperty(prop));
  }

  // For performance reasons we will skip temp file creation if the file system
  // does not support efficient renaming, and write the file directly.
  this.useTempPath = FileSystemUtil.supportsRename(fs.getUri(), conf);
}
Example #18
Source File: TestProjection.java From kite with Apache License 2.0
@Test
public void testMixedProjection() throws IOException {
  Dataset<StandardEvent> original = repo.create("ns", "mixedProjection",
      new DatasetDescriptor.Builder()
          .schema(StandardEvent.class)
          .build(),
      StandardEvent.class);

  DatasetWriter<StandardEvent> writer = null;
  try {
    writer = original.newWriter();
    writer.write(sepEvent);
    writer.write(octEvent);
    writer.write(novEvent);
  } finally {
    Closeables.close(writer, false);
  }

  Dataset<ReflectSmallEvent> dataset = repo.load("ns", original.getName(),
      ReflectSmallEvent.class);

  Set<ReflectSmallEvent> expected = Sets.newHashSet(
      new ReflectSmallEvent(sepEvent),
      new ReflectSmallEvent(octEvent),
      new ReflectSmallEvent(novEvent));

  assertContentEquals(expected, dataset);
}
Example #19
Source File: DatasetKeyOutputFormat.java From kite with Apache License 2.0
private static <E> View<E> loadOrCreateTaskAttemptView(TaskAttemptContext taskContext) {
  Configuration conf = Hadoop.JobContext.getConfiguration.invoke(taskContext);
  Map<String, String> uriOptions = Registration.lookupDatasetUri(
      URI.create(URI.create(conf.get(KITE_OUTPUT_URI)).getSchemeSpecificPart())).second();
  Dataset<E> dataset = loadOrCreateTaskAttemptDataset(taskContext);

  if (dataset instanceof AbstractDataset) {
    DatasetDescriptor descriptor = dataset.getDescriptor();
    Schema schema = descriptor.getSchema();
    PartitionStrategy strategy = null;
    if (descriptor.isPartitioned()) {
      strategy = descriptor.getPartitionStrategy();
    }
    Constraints constraints = Constraints.fromQueryMap(
        schema, strategy, uriOptions);
    return ((AbstractDataset<E>) dataset).filter(constraints);
  } else {
    return dataset;
  }
}
Example #20
Source File: TestHiveUtils.java From kite with Apache License 2.0
@Test
public void testUpdateChangesDDL() throws Exception {
  DatasetDescriptor original = new DatasetDescriptor.Builder()
      .schema(SchemaBuilder.record("Test").fields()
          .requiredLong("id")
          .requiredString("data")
          .endRecord())
      .build();

  boolean external = false;
  Table table = HiveUtils.tableForDescriptor("ns", "test", original, external);

  DatasetDescriptor updated = new DatasetDescriptor.Builder()
      .schema(SchemaBuilder.record("Test").fields()
          .requiredLong("id")
          .requiredString("data")
          .nullableString("data2", "")
          .endRecord())
      .build();

  HiveUtils.updateTableSchema(table, updated);

  Assert.assertEquals("Should update the table DDL",
      table.getSd().getCols(),
      HiveSchemaConverter.convertSchema(updated.getSchema()));
}
Example #21
Source File: Compatibility.java From kite with Apache License 2.0
/**
 * Checks that the {@code existing} {@link DatasetDescriptor} is compatible
 * with {@code test}.
 *
 * @param existing the current {@code DatasetDescriptor} for a dataset
 * @param test a new {@code DatasetDescriptor} for the same dataset
 */
public static void checkCompatible(DatasetDescriptor existing,
                                   DatasetDescriptor test) {
  checkNotChanged("format", existing.getFormat(), test.getFormat());

  checkNotChanged("partitioning",
      existing.isPartitioned(), test.isPartitioned());

  if (existing.isPartitioned()) {
    checkStrategyUpdate(
        existing.getPartitionStrategy(),
        test.getPartitionStrategy(),
        test.getSchema());
  }

  // check can read records written with old schema using new schema
  Schema oldSchema = existing.getSchema();
  Schema testSchema = test.getSchema();
  if (!SchemaValidationUtil.canRead(oldSchema, testSchema)) {
    throw new IncompatibleSchemaException("Schema cannot read data " +
        "written using existing schema. Schema: " + testSchema.toString(true) +
        "\nExisting schema: " + oldSchema.toString(true));
  }
}
Example #22
Source File: CSVFileReader.java From kite with Apache License 2.0
@SuppressWarnings("unchecked") public CSVFileReader(FileSystem fileSystem, Path path, DatasetDescriptor descriptor, EntityAccessor<E> accessor) { this.fs = fileSystem; this.path = path; this.schema = accessor.getReadSchema(); this.recordClass = accessor.getType(); this.state = ReaderWriterState.NEW; this.props = CSVProperties.fromDescriptor(descriptor); // defaults to false: assume that callers will not make defensive copies this.reuseRecords = DescriptorUtil.isEnabled(REUSE_RECORDS, descriptor); Preconditions.checkArgument(Schema.Type.RECORD.equals(schema.getType()), "Schemas for CSV files must be records of primitive types"); }
Example #23
Source File: TestCrunchDatasets.java From kite with Apache License 2.0
@Test
public void testSourceView() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder()
      .hash("username", 2).build();

  Dataset<Record> inputDataset = repo.create("ns", "in",
      new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("ns", "out",
      new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA).format(Formats.PARQUET).build());

  writeTestUsers(inputDataset, 10);

  View<Record> inputView = inputDataset.with("username", "test-0");
  Assert.assertEquals(1, datasetSize(inputView));

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputView));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(1, datasetSize(outputDataset));
}
Example #24
Source File: TestCompatibilityChecks.java From kite with Apache License 2.0
@Test
public void testAllowedPartitionSchemaCombinations() {
  Compatibility.checkDescriptor(
      new DatasetDescriptor.Builder()
          .schema(schema)
          .partitionStrategy(new PartitionStrategy.Builder()
              .year("timestamp")
              .month("timestamp")
              .day("timestamp")
              .hour("timestamp")
              .minute("timestamp")
              .identity("message", "message_copy")
              .identity("timestamp", "ts")
              .identity("number", "num")
              .hash("message", 48)
              .hash("timestamp", 48)
              .hash("number", 48)
              .hash("payload", 48)
              .hash("float", 48)
              .hash("double", 48)
              .hash("bool", 48)
              .range("number", 5, 10, 15, 20)
              .range("message", "m", "z", "M", "Z")
              .build())
          .build());
}
Example #25
Source File: TestConfigurationProperty.java From nifi with Apache License 2.0
@Before
public void createDataset() throws Exception {
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(TestUtil.USER_SCHEMA)
      .build();
  this.datasetUri = "dataset:file:" + temp.newFolder("ns", "temp").toString();
  this.dataset = Datasets.create(datasetUri, descriptor, Record.class);
}
Example #26
Source File: DescriptorUtil.java From kite with Apache License 2.0
/**
 * Returns whether the value of the descriptor property is {@code true}.
 *
 * @param property a String property name
 * @param descriptor a {@link DatasetDescriptor}
 * @return {@code true} if set and "true", {@code false} otherwise.
 */
public static boolean isEnabled(String property, DatasetDescriptor descriptor) {
  if (descriptor.hasProperty(property)) {
    // return true if and only if the property value is "true"
    return Boolean.valueOf(descriptor.getProperty(property));
  }
  return false;
}
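As a companion to isEnabled, here is a minimal hedged sketch showing how a boolean property set through the builder (as in Example #6) is read back; the property name my.feature.enabled is a hypothetical placeholder, not a property defined by Kite.

// "my.feature.enabled" is a hypothetical property name, used only for illustration
DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
    .schemaLiteral("\"string\"")
    .property("my.feature.enabled", "true")
    .build();

// true: the property is present and its value is "true"
boolean enabled = DescriptorUtil.isEnabled("my.feature.enabled", descriptor);

// false: a property that was never set defaults to false
boolean unset = DescriptorUtil.isEnabled("some.other.property", descriptor);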
Example #27
Source File: AvroKeyEntitySchemaParser.java From kite with Apache License 2.0
@Override
public AvroKeySchema parseKeySchema(String rawSchema) {
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaLiteral(rawSchema)
      .build();
  return new AvroKeySchema(
      descriptor.getSchema(), descriptor.getPartitionStrategy());
}
Example #28
Source File: TestFileSystemUtil.java From kite with Apache License 2.0
@Test
public void testMultipleAvroFilesAtDifferentDepths() throws Exception {
  File folder = temp.newFolder("a/b/c/d/e");
  Path root = new Path(temp.getRoot().toURI());
  FileSystem fs = LocalFileSystem.getInstance();

  // create two Avro files under separate folders
  Path parent = new Path(folder.toURI());
  createAvroUserFile(fs, new Path(parent, "part=1"));
  createAvroUserFile(fs, parent);

  DatasetDescriptor descriptor = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(fs, root));

  PartitionStrategy strategy = new PartitionStrategy.Builder()
      .provided("part", "int")
      .build();

  Assert.assertTrue("Should flag data at mixed depth in the directory tree",
      DescriptorUtil.isEnabled("kite.filesystem.mixed-depth", descriptor));
  Assert.assertEquals("Should be directly under parent",
      parent.toUri(), descriptor.getLocation());
  Assert.assertEquals("Should use user schema",
      USER_SCHEMA, descriptor.getSchema());
  Assert.assertEquals("Should have Avro format",
      Formats.AVRO, descriptor.getFormat());
  Assert.assertEquals("Should be partitioned by part=int",
      strategy, descriptor.getPartitionStrategy());
}
Example #29
Source File: TestKiteURIHandler.java From kite with Apache License 2.0
@Before
public void setUp() throws IOException, URISyntaxException {
  this.conf = (distributed ?
      MiniDFSTest.getConfiguration() :
      new Configuration());

  this.fs = FileSystem.get(conf);

  this.testDescriptor = new DatasetDescriptor.Builder()
      .format(Formats.AVRO)
      .schema(SchemaBuilder.record("Event").fields()
          .requiredLong("timestamp")
          .requiredString("message")
          .endRecord())
      .partitionStrategy(new PartitionStrategy.Builder()
          .year("timestamp")
          .month("timestamp")
          .day("timestamp")
          .build())
      .build();

  uriHandler = new KiteURIHandler();

  startingConf = DefaultConfiguration.get();

  startingOozieHome = System.getProperty("oozie.home.dir");
}
Example #30
Source File: TestFileSystemDatasetRepository.java From kite with Apache License 2.0
@Test
public void testUpdateFailsWithIncompatibleSchemaChange() {
  Dataset<Record> dataset = repo.create(NAMESPACE, NAME,
      new DatasetDescriptor.Builder()
          .schema(testSchema).build());

  Assert.assertEquals("Dataset name is propagated", NAME,
      dataset.getName());
  Assert.assertEquals("Dataset schema is propagated", testSchema,
      dataset.getDescriptor().getSchema());

  Schema testSchemaV2 = SchemaBuilder.record("user").fields()
      .requiredString("username")
      .requiredString("email")
      .requiredString("favoriteColor") // incompatible - no default
      .endRecord();

  try {
    repo.update(NAMESPACE, NAME, new DatasetDescriptor.Builder(
        dataset.getDescriptor()).schema(testSchemaV2).build());
    Assert.fail("Should fail due to incompatible update");
  } catch (ValidationException e) {
    // expected
  }

  dataset = repo.load(NAMESPACE, NAME);
  Assert.assertEquals("Dataset schema is unchanged", testSchema,
      dataset.getDescriptor().getSchema());
}