org.kitesdk.data.Formats Java Examples
The following examples show how to use org.kitesdk.data.Formats.
The source file, project, and license are noted above each example.
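Before the examples, here is a minimal sketch (not taken from the Kite sources) of the typical role of Formats: selecting the on-disk format when building a DatasetDescriptor. The schema URI resource:user.avsc is a placeholder for an Avro schema on your classpath.

import org.kitesdk.data.DatasetDescriptor;
import org.kitesdk.data.Format;
import org.kitesdk.data.Formats;

public class FormatsOverview {
  public static void main(String[] args) throws Exception {
    // Assumption: user.avsc is an Avro schema available on the classpath.
    DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
        .schemaUri("resource:user.avsc")
        .format(Formats.PARQUET)
        .build();

    Format format = descriptor.getFormat();
    System.out.println(format.getName());                      // parquet
    System.out.println(format.getSupportedCompressionTypes()); // e.g. snappy, deflate, ...
  }
}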
Example #1
Source File: TestFileSystemUtil.java From kite with Apache License 2.0
@Test
public void testMultipleAvroFilesInOneFolder() throws Exception {
  File folder = temp.newFolder("a/b/c/d/e");
  Path root = new Path(temp.getRoot().toURI());
  FileSystem fs = LocalFileSystem.getInstance();

  // create two Avro files in the parent folder
  Path parent = new Path(folder.toURI());
  createAvroUserFile(fs, parent);
  createAvroUserFile(fs, parent);

  DatasetDescriptor descriptor = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(fs, root));

  Assert.assertFalse("Should not flag at mixed depth",
      descriptor.hasProperty("kite.filesystem.mixed-depth"));
  Assert.assertEquals("Should be directly under parent",
      parent.toUri(), descriptor.getLocation());
  Assert.assertEquals("Should use user schema",
      USER_SCHEMA, descriptor.getSchema());
  Assert.assertEquals("Should have Avro format",
      Formats.AVRO, descriptor.getFormat());
  Assert.assertFalse("Should not be partitioned", descriptor.isPartitioned());
}
Example #2
Source File: AvroAppender.java From kite with Apache License 2.0
private CodecFactory getCodecFactory() {
  switch (compressionType) {
    case Snappy:
      return CodecFactory.snappyCodec();
    case Deflate:
      return CodecFactory.deflateCodec(9);
    case Bzip2:
      return CodecFactory.bzip2Codec();
    default:
      throw new IllegalArgumentException(String.format(
          "Unsupported compression format %s. Supported formats: %s",
          compressionType.getName(),
          Arrays.toString(
              Formats.AVRO.getSupportedCompressionTypes().toArray())));
  }
}
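The switch above rejects compression types that Avro does not support and lists the supported set in the error message. As a quick hedged sketch (not from the Kite sources), the same query can be made directly on each built-in format:

import org.kitesdk.data.Format;
import org.kitesdk.data.Formats;

public class ListSupportedCompression {
  public static void main(String[] args) {
    // Print the compression types each built-in format reports as supported.
    for (Format format : new Format[] {Formats.AVRO, Formats.PARQUET, Formats.CSV}) {
      System.out.println(format.getName() + ": "
          + format.getSupportedCompressionTypes());
    }
  }
}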
Example #3
Source File: FileSystemUtil.java From kite with Apache License 2.0
@Override
Result file(FileSystem fs, Path path) throws IOException {
  Format format = formatFromExt(path);
  Schema schema = null;
  if (format == Formats.AVRO) {
    schema = Schemas.fromAvro(fs, path);
  } else if (format == Formats.PARQUET) {
    schema = Schemas.fromParquet(fs, path);
  } else if (format == Formats.JSON) {
    schema = Schemas.fromJSON("record", fs, path);
  }

  if (schema == null) {
    return new Result.Unknown();
  }

  return new Result.Table(path, format, schema, path.depth());
}
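Example #3 relies on a helper, formatFromExt, that maps a file extension to a Format. Below is a hedged sketch of that idea; the helper is hypothetical, and it assumes Formats.fromString resolves built-in formats by name, throwing IllegalArgumentException for unknown names.

import org.kitesdk.data.Format;
import org.kitesdk.data.Formats;

public class FormatFromExtension {
  // Hypothetical stand-in for formatFromExt: maps a file name's extension
  // to a built-in Format, or null when the extension is not recognized.
  static Format formatFromExtension(String fileName) {
    int dot = fileName.lastIndexOf('.');
    if (dot < 0 || dot == fileName.length() - 1) {
      return null;
    }
    try {
      return Formats.fromString(fileName.substring(dot + 1));
    } catch (IllegalArgumentException e) {
      return null; // not one of avro, parquet, json, csv, ...
    }
  }

  public static void main(String[] args) {
    Format f = formatFromExtension("part-00000.avro");
    System.out.println(f == null ? "unknown" : f.getName()); // avro
    f = formatFromExtension("README");
    System.out.println(f == null ? "unknown" : f.getName()); // unknown
  }
}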
Example #4
Source File: MultiFileDatasetReader.java From kite with Apache License 2.0
@SuppressWarnings("unchecked") // See https://github.com/Parquet/parquet-mr/issues/106 private void openNextReader() { if (Formats.PARQUET.equals(descriptor.getFormat())) { this.reader = new ParquetFileSystemDatasetReader(fileSystem, filesIter.next(), accessor.getReadSchema(), accessor.getType()); } else if (Formats.JSON.equals(descriptor.getFormat())) { this.reader = new JSONFileReader<E>( fileSystem, filesIter.next(), accessor); } else if (Formats.CSV.equals(descriptor.getFormat())) { this.reader = new CSVFileReader<E>(fileSystem, filesIter.next(), descriptor, accessor); } else if (Formats.INPUTFORMAT.equals(descriptor.getFormat())) { this.reader = new InputFormatReader(fileSystem, filesIter.next(), descriptor); } else { this.reader = new FileSystemDatasetReader<E>(fileSystem, filesIter.next(), accessor.getReadSchema(), accessor.getType()); } reader.initialize(); this.readerIterator = Iterators.filter(reader, constraints.toEntityPredicate( (pathIter != null ? pathIter.getStorageKey() : null), accessor)); }
Example #5
Source File: ParquetAppender.java From kite with Apache License 2.0
private CompressionCodecName getCompressionCodecName() {
  switch (compressionType) {
    case Snappy:
      return CompressionCodecName.SNAPPY;
    case Lzo:
      return CompressionCodecName.LZO;
    case Deflate:
      return CompressionCodecName.GZIP;
    default:
      throw new IllegalArgumentException(String.format(
          "Unsupported compression format %s. Supported formats: %s",
          compressionType.getName(),
          Arrays.toString(
              Formats.PARQUET.getSupportedCompressionTypes().toArray())));
  }
}
Example #6
Source File: PartitionedDatasetWriter.java From kite with Apache License 2.0
static <E> PartitionedDatasetWriter<E, ?> newWriter(FileSystemView<E> view) {
  DatasetDescriptor descriptor = view.getDataset().getDescriptor();
  Format format = descriptor.getFormat();
  if (Formats.PARQUET.equals(format)) {
    // by default, Parquet is not durable
    if (DescriptorUtil.isDisabled(
        FileSystemProperties.NON_DURABLE_PARQUET_PROP, descriptor)) {
      return new IncrementalPartitionedDatasetWriter<E>(view);
    } else {
      return new NonDurablePartitionedDatasetWriter<E>(view);
    }
  } else if (Formats.AVRO.equals(format) || Formats.CSV.equals(format)) {
    return new IncrementalPartitionedDatasetWriter<E>(view);
  } else {
    return new NonDurablePartitionedDatasetWriter<E>(view);
  }
}
Example #7
Source File: TestFileSystemUtil.java From kite with Apache License 2.0
@Test
public void testSingleParquetFile() throws Exception {
  File folder = temp.newFolder("a/b/c/d/e");
  Path root = new Path(temp.getRoot().toURI());
  FileSystem fs = LocalFileSystem.getInstance();

  // create a single Parquet file
  Path parent = new Path(folder.toURI());
  createParquetEventFile(fs, parent);

  DatasetDescriptor descriptor = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(fs, root));

  Assert.assertFalse("Should not flag at mixed depth",
      descriptor.hasProperty("kite.filesystem.mixed-depth"));
  Assert.assertEquals("Should be directly under parent",
      parent.toUri().getPath(), parent(descriptor.getLocation()).getPath());
  Assert.assertTrue("Should be a .parquet file",
      descriptor.getLocation().toString().endsWith(".parquet"));
  Assert.assertEquals("Should use event schema",
      EVENT_SCHEMA, descriptor.getSchema());
  Assert.assertEquals("Should have Parquet format",
      Formats.PARQUET, descriptor.getFormat());
  Assert.assertFalse("Should not be partitioned", descriptor.isPartitioned());
}
Example #8
Source File: TestFileSystemUtil.java From kite with Apache License 2.0
@Test
public void testMultipleParquetFilesInOneFolder() throws Exception {
  File folder = temp.newFolder("a/b/c/d/e");
  Path root = new Path(temp.getRoot().toURI());
  FileSystem fs = LocalFileSystem.getInstance();

  // create two Parquet files in the same folder
  Path parent = new Path(folder.toURI());
  createParquetEventFile(fs, parent);
  createParquetEventFile(fs, parent);

  DatasetDescriptor descriptor = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(fs, root));

  Assert.assertFalse("Should not flag at mixed depth",
      descriptor.hasProperty("kite.filesystem.mixed-depth"));
  Assert.assertEquals("Should be directly under parent",
      parent.toUri(), descriptor.getLocation());
  Assert.assertEquals("Should use event schema",
      EVENT_SCHEMA, descriptor.getSchema());
  Assert.assertEquals("Should have Parquet format",
      Formats.PARQUET, descriptor.getFormat());
  Assert.assertFalse("Should not be partitioned", descriptor.isPartitioned());
}
Example #9
Source File: TestCrunchDatasets.java From kite with Apache License 2.0
@Test
public void testSourceView() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder()
      .hash("username", 2).build();

  Dataset<Record> inputDataset = repo.create("ns", "in",
      new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("ns", "out",
      new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA).format(Formats.PARQUET).build());

  writeTestUsers(inputDataset, 10);

  View<Record> inputView = inputDataset.with("username", "test-0");
  Assert.assertEquals(1, datasetSize(inputView));

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputView));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset),
      Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(1, datasetSize(outputDataset));
}
Example #10
Source File: TestCrunchDatasets.java From kite with Apache License 2.0
@Test
public void testPartitionedSource() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder()
      .hash("username", 2).build();

  Dataset<Record> inputDataset = repo.create("ns", "in",
      new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("ns", "out",
      new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA).format(Formats.PARQUET).build());

  writeTestUsers(inputDataset, 10);

  PartitionKey key = new PartitionKey(0);
  Dataset<Record> inputPart0 =
      ((PartitionedDataset<Record>) inputDataset).getPartition(key, false);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputPart0));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset),
      Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(5, datasetSize(outputDataset));
}
Example #11
Source File: TestCrunchDatasets.java From kite with Apache License 2.0
@Test
public void testGenericParquet() throws IOException {
  Dataset<Record> inputDataset = repo.create("ns", "in",
      new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA).format(Formats.PARQUET).build());
  Dataset<Record> outputDataset = repo.create("ns", "out",
      new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA).format(Formats.PARQUET).build());

  // write two files, each of 5 records
  writeTestUsers(inputDataset, 5, 0);
  writeTestUsers(inputDataset, 5, 5);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputDataset));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset),
      Target.WriteMode.APPEND);
  pipeline.run();

  checkTestUsers(outputDataset, 10);
}
Example #12
Source File: TestFileSystemDatasetRepository.java From kite with Apache License 2.0
@Test
public void testUpdateFailsWithFormatChange() {
  Dataset<Record> dataset = repo.create(NAMESPACE, NAME,
      new DatasetDescriptor.Builder(testDescriptor)
          .format(Formats.AVRO)
          .build());

  DatasetDescriptor changed =
      new DatasetDescriptor.Builder(dataset.getDescriptor())
          .format(Formats.PARQUET)
          .build();

  try {
    repo.update(NAMESPACE, NAME, changed);
    Assert.fail("Should fail due to format change");
  } catch (ValidationException e) {
    // expected
  }

  Assert.assertEquals(
      Formats.AVRO,
      repo.load(NAMESPACE, NAME).getDescriptor().getFormat());
}
Example #13
Source File: TestCreateDatasetWithExistingData.java From kite with Apache License 2.0
@Test
public void testCreateFromExistingPartitioned() throws Exception {
  command.datasets = Lists.newArrayList(existingPartitionedURI);
  command.run();

  verify(console).debug(contains("Created"), eq(existingPartitionedURI));

  PartitionStrategy providedVersionStrategy = new PartitionStrategy.Builder()
      .provided("version", "int")
      .build();

  // load the new dataset and verify it
  Dataset<GenericRecord> users = Datasets.load(existingPartitionedURI);
  Assert.assertEquals("Schema should match",
      USER_SCHEMA, users.getDescriptor().getSchema());
  Assert.assertEquals("Should be partitioned with a provided partitioner",
      providedVersionStrategy, users.getDescriptor().getPartitionStrategy());
  Assert.assertEquals("Should be Parquet",
      Formats.PARQUET, users.getDescriptor().getFormat());
}
Example #14
Source File: TestCreateDatasetWithExistingData.java From kite with Apache License 2.0
@Test
public void testCreateFromExistingWithLocation() throws Exception {
  command.datasets = Lists.newArrayList(existingDataURI);
  command.location = existingPartitionedPathWithPartition.toString();
  command.run();

  verify(console).debug(contains("Created"), eq(existingDataURI));

  // load the new dataset and verify it
  Dataset<GenericRecord> users = Datasets.load(existingDataURI);
  Assert.assertEquals("Schema should match",
      USER_SCHEMA, users.getDescriptor().getSchema());
  Assert.assertFalse("Should not be partitioned",
      users.getDescriptor().isPartitioned());
  Assert.assertEquals("Should be Parquet",
      Formats.PARQUET, users.getDescriptor().getFormat());
  Assert.assertTrue("Location should point to the partitioned data",
      String.valueOf(users.getDescriptor().getLocation())
          .endsWith(existingPartitionedPathWithPartition.toString()));
}
Example #15
Source File: TestCreateDatasetWithExistingData.java From kite with Apache License 2.0
@Test
public void testCreateFromExisting() throws Exception {
  command.datasets = Lists.newArrayList(existingDataURI);
  command.run();

  verify(console).debug(contains("Created"), eq(existingDataURI));

  // load the new dataset and verify it
  Dataset<GenericRecord> users = Datasets.load(existingDataURI);
  Assert.assertEquals("Schema should match",
      USER_SCHEMA, users.getDescriptor().getSchema());
  Assert.assertFalse("Should not be partitioned",
      users.getDescriptor().isPartitioned());
  Assert.assertEquals("Should be Parquet",
      Formats.PARQUET, users.getDescriptor().getFormat());
}
Example #16
Source File: TestFileSystemUtil.java From kite with Apache License 2.0
@Test
public void testSingleAvroFile() throws Exception {
  File folder = temp.newFolder("a/b/c/d/e");
  Path root = new Path(temp.getRoot().toURI());
  FileSystem fs = LocalFileSystem.getInstance();

  // create a single Avro file
  Path parent = new Path(folder.toURI());
  createAvroUserFile(fs, parent);

  DatasetDescriptor descriptor = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(fs, root));

  Assert.assertFalse("Should not flag at mixed depth",
      descriptor.hasProperty("kite.filesystem.mixed-depth"));
  Assert.assertEquals("Should be directly under parent",
      parent.toUri().getPath(), parent(descriptor.getLocation()).getPath());
  Assert.assertTrue("Should be a .avro file",
      descriptor.getLocation().toString().endsWith(".avro"));
  Assert.assertEquals("Should use user schema",
      USER_SCHEMA, descriptor.getSchema());
  Assert.assertEquals("Should have Avro format",
      Formats.AVRO, descriptor.getFormat());
  Assert.assertFalse("Should not be partitioned", descriptor.isPartitioned());
}
Example #17
Source File: FileSystemWriter.java From kite with Apache License 2.0
@VisibleForTesting
@SuppressWarnings("unchecked")
<E> FileAppender<E> newAppender(Path temp) {
  Format format = descriptor.getFormat();
  if (Formats.PARQUET.equals(format)) {
    // by default, Parquet is not durable
    if (DescriptorUtil.isDisabled(
        FileSystemProperties.NON_DURABLE_PARQUET_PROP, descriptor)) {
      return (FileAppender<E>) new DurableParquetAppender(
          fs, temp, schema, conf, descriptor.getCompressionType());
    } else {
      return (FileAppender<E>) new ParquetAppender(
          fs, temp, schema, conf, descriptor.getCompressionType());
    }
  } else if (Formats.AVRO.equals(format)) {
    return new AvroAppender<E>(fs, temp, schema,
        descriptor.getCompressionType());
  } else if (Formats.CSV.equals(format) &&
      DescriptorUtil.isEnabled(FileSystemProperties.ALLOW_CSV_PROP, descriptor)) {
    return new CSVAppender<E>(fs, temp, descriptor);
  } else {
    this.state = ReaderWriterState.ERROR;
    throw new UnknownFormatException("Unknown format " + descriptor);
  }
}
Example #18
Source File: FileSystemWriter.java From kite with Apache License 2.0
static <E> FileSystemWriter<E> newWriter(FileSystem fs, Path path,
    long rollIntervalMillis, long targetFileSize,
    DatasetDescriptor descriptor, Schema writerSchema) {
  Format format = descriptor.getFormat();
  if (Formats.PARQUET.equals(format)) {
    // by default, Parquet is not durable
    if (DescriptorUtil.isDisabled(
        FileSystemProperties.NON_DURABLE_PARQUET_PROP, descriptor)) {
      return new IncrementalWriter<E>(
          fs, path, rollIntervalMillis, targetFileSize, descriptor,
          writerSchema);
    } else {
      return new FileSystemWriter<E>(
          fs, path, rollIntervalMillis, targetFileSize, descriptor,
          writerSchema);
    }
  } else if (Formats.AVRO.equals(format) || Formats.CSV.equals(format)) {
    return new IncrementalWriter<E>(
        fs, path, rollIntervalMillis, targetFileSize, descriptor,
        writerSchema);
  } else {
    return new FileSystemWriter<E>(
        fs, path, rollIntervalMillis, targetFileSize, descriptor,
        writerSchema);
  }
}
Example #19
Source File: TestFileSystemDataset.java From kite with Apache License 2.0
@Test(expected = ValidationException.class)
public void testCannotMergeDatasetsWithDifferentFormats() throws IOException {
  FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("users")
      .configuration(getConfiguration())
      .descriptor(new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA)
          .format(Formats.AVRO)
          .location(testDirectory)
          .build())
      .type(Record.class)
      .build();
  FileSystemDataset<Record> dsUpdate = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("users")
      .configuration(getConfiguration())
      .descriptor(new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA)
          .format(Formats.PARQUET)
          .location(testDirectory)
          .build())
      .type(Record.class)
      .build();
  ds.merge(dsUpdate);
}
Example #20
Source File: TestFileSystemUtil.java From kite with Apache License 2.0
@Test
public void testPartitionedDatasetWithEscapedChars() throws Exception {
  File folder = temp.newFolder("a/b/c/d/e/dataset_name");
  Path root = new Path(temp.getRoot().toURI());
  FileSystem fs = LocalFileSystem.getInstance();

  URI datasetUri = URI.create("dataset:file:" + folder.getAbsolutePath());
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .partitionStrategy(new PartitionStrategy.Builder()
          .provided("s")
          .build())
      .build();

  Dataset<GenericRecord> dataset = Datasets.create(datasetUri, descriptor);

  // write two so that the descriptor uses the directory rather than a file
  writeUserToView(dataset.with("s", "test/-0"));
  writeUserToView(dataset.with("s", "test/-0"));

  Path datasetPath = new Path(folder.toURI());
  Path partitionPath = new Path(datasetPath, "s=test%2F-0");

  DatasetDescriptor actual = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(fs, root));

  Assert.assertFalse("Should not flag at mixed depth",
      descriptor.hasProperty("kite.filesystem.mixed-depth"));
  Assert.assertEquals("Location should be at the partition directory",
      URI.create(partitionPath.toString()), actual.getLocation());
  Assert.assertEquals("Should use user schema",
      USER_SCHEMA, actual.getSchema());
  Assert.assertEquals("Should have Avro format",
      Formats.AVRO, actual.getFormat());
  Assert.assertFalse("Should not be partitioned", actual.isPartitioned());
}
Example #21
Source File: FileSystemDataset.java From kite with Apache License 2.0
FileSystemDataset(FileSystem fileSystem, Path directory,
    String namespace, String name,
    DatasetDescriptor descriptor, URI uri,
    @Nullable PartitionListener partitionListener,
    Class<E> type) {
  super(type, descriptor.getSchema());
  if (Formats.PARQUET.equals(descriptor.getFormat())) {
    Preconditions.checkArgument(IndexedRecord.class.isAssignableFrom(type) ||
        type == Object.class,
        "Parquet only supports generic and specific data models, type"
        + " parameter must implement IndexedRecord");
  }

  this.fileSystem = fileSystem;
  this.directory = directory;
  this.namespace = namespace;
  this.name = name;
  this.descriptor = descriptor;
  this.partitionStrategy =
      descriptor.isPartitioned() ? descriptor.getPartitionStrategy() : null;
  this.partitionListener = partitionListener;
  this.convert = new PathConversion(descriptor.getSchema());
  this.uri = uri;

  Path signalsPath = new Path(getDirectory(fileSystem, directory),
      SIGNALS_DIRECTORY_NAME);
  this.signalManager = new SignalManager(fileSystem, signalsPath);
  this.unbounded = new FileSystemPartitionView<E>(
      this, partitionListener, signalManager, type);

  // remove this.partitionKey for 0.14.0
  this.partitionKey = null;
}
Example #22
Source File: TestFileSystemUtil.java From kite with Apache License 2.0
@Test
public void testMultipleAvroFilesInSeparateFolders() throws Exception {
  File folder = temp.newFolder("a/b/c/d/e");
  Path root = new Path(temp.getRoot().toURI());
  FileSystem fs = LocalFileSystem.getInstance();

  // create two Avro files under separate folders
  Path parent = new Path(folder.toURI());
  createAvroUserFile(fs, new Path(parent, "part=1"));
  createAvroUserFile(fs, new Path(parent, "2"));

  DatasetDescriptor descriptor = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(fs, root));

  PartitionStrategy strategy = new PartitionStrategy.Builder()
      .provided("part", "int")
      .build();

  Assert.assertFalse("Should not flag at mixed depth",
      descriptor.hasProperty("kite.filesystem.mixed-depth"));
  Assert.assertEquals("Should be directly under parent",
      parent.toUri(), descriptor.getLocation());
  Assert.assertEquals("Should use user schema",
      USER_SCHEMA, descriptor.getSchema());
  Assert.assertEquals("Should have Avro format",
      Formats.AVRO, descriptor.getFormat());
  Assert.assertEquals("Should be partitioned by part=int",
      strategy, descriptor.getPartitionStrategy());
}
Example #23
Source File: TestFileSystemUtil.java From kite with Apache License 2.0
@Test
public void testMultipleAvroFilesAtDifferentDepths() throws Exception {
  File folder = temp.newFolder("a/b/c/d/e");
  Path root = new Path(temp.getRoot().toURI());
  FileSystem fs = LocalFileSystem.getInstance();

  // create two Avro files at different depths
  Path parent = new Path(folder.toURI());
  createAvroUserFile(fs, new Path(parent, "part=1"));
  createAvroUserFile(fs, parent);

  DatasetDescriptor descriptor = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(fs, root));

  PartitionStrategy strategy = new PartitionStrategy.Builder()
      .provided("part", "int")
      .build();

  Assert.assertTrue("Should flag data at mixed depth in the directory tree",
      DescriptorUtil.isEnabled("kite.filesystem.mixed-depth", descriptor));
  Assert.assertEquals("Should be directly under parent",
      parent.toUri(), descriptor.getLocation());
  Assert.assertEquals("Should use user schema",
      USER_SCHEMA, descriptor.getSchema());
  Assert.assertEquals("Should have Avro format",
      Formats.AVRO, descriptor.getFormat());
  Assert.assertEquals("Should be partitioned by part=int",
      strategy, descriptor.getPartitionStrategy());
}
Example #24
Source File: TestFileSystemUtil.java From kite with Apache License 2.0
@Test
public void testMultipleMergeTablesAtDifferentDepths() throws Exception {
  File folder = temp.newFolder("a/b/c/d/e");
  Path root = new Path(temp.getRoot().toURI());
  FileSystem fs = LocalFileSystem.getInstance();

  // create Avro files at mixed depths in the directory tree
  Path parent = new Path(folder.toURI());
  createAvroUserFile(fs, new Path(parent, "part=1"));
  createAvroUserFile(fs, new Path(parent, "part=1"));
  createAvroUserFile(fs, parent);

  DatasetDescriptor descriptor = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(fs, root));

  PartitionStrategy strategy = new PartitionStrategy.Builder()
      .provided("part", "int")
      .build();

  Assert.assertTrue("Should flag data at mixed depth in the directory tree",
      DescriptorUtil.isEnabled("kite.filesystem.mixed-depth", descriptor));
  Assert.assertEquals("Should be directly under parent",
      parent.toUri(), descriptor.getLocation());
  Assert.assertEquals("Should use user schema",
      USER_SCHEMA, descriptor.getSchema());
  Assert.assertEquals("Should have Avro format",
      Formats.AVRO, descriptor.getFormat());
  Assert.assertEquals("Should be partitioned by part=int",
      strategy, descriptor.getPartitionStrategy());
}
Example #25
Source File: TestFileSystemUtil.java From kite with Apache License 2.0
@Test
public void testMultipleParquetFilesInSeparateFolders() throws Exception {
  File folder = temp.newFolder("a/b/c/d/e");
  Path root = new Path(temp.getRoot().toURI());
  FileSystem fs = LocalFileSystem.getInstance();

  // create two Parquet files under separate folders
  Path parent = new Path(folder.toURI());
  createParquetEventFile(fs, new Path(parent, "part"));
  createParquetEventFile(fs, new Path(parent, "2"));

  DatasetDescriptor descriptor = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(fs, root));

  PartitionStrategy strategy = new PartitionStrategy.Builder()
      .provided("partition_1", "string")
      .build();

  Assert.assertFalse("Should not flag at mixed depth",
      descriptor.hasProperty("kite.filesystem.mixed-depth"));
  Assert.assertEquals("Should be directly under parent",
      parent.toUri(), descriptor.getLocation());
  Assert.assertEquals("Should use event schema",
      EVENT_SCHEMA, descriptor.getSchema());
  Assert.assertEquals("Should have Parquet format",
      Formats.PARQUET, descriptor.getFormat());
  Assert.assertEquals("Should be partitioned by partition_1=string",
      strategy, descriptor.getPartitionStrategy());
}
Example #26
Source File: TestMetadataProviders.java From kite with Apache License 2.0
@Before
public void setUp() throws IOException, URISyntaxException {
  this.conf = (distributed ?
      MiniDFSTest.getConfiguration() :
      new Configuration());

  this.testDescriptor = new DatasetDescriptor.Builder()
      .format(Formats.AVRO)
      .schema(SchemaBuilder.record("Event").fields()
          .requiredLong("timestamp")
          .requiredString("message")
          .endRecord())
      .partitionStrategy(new PartitionStrategy.Builder()
          .year("timestamp")
          .month("timestamp")
          .day("timestamp")
          .build())
      .build();
  // something completely different
  this.anotherDescriptor = new DatasetDescriptor.Builder()
      .format(Formats.PARQUET)
      .schema(SchemaBuilder.record("Record").fields()
          .requiredBytes("some_field")
          .requiredString("another_field")
          .endRecord())
      .partitionStrategy(new PartitionStrategy.Builder()
          .hash("some_field", 20000)
          .build())
      .build();

  this.provider = newProvider(conf);
}
Example #27
Source File: TestMetadataProviders.java From kite with Apache License 2.0
@Test
public void testLargeSchema() {
  // Only run this test in distributed mode, since non-HDFS schema URLs result
  // in the schema being loaded into the Hive metastore, and large schemas
  // can exceed the size limit of that.
  Assume.assumeTrue(distributed);

  Assert.assertFalse("Sanity check",
      provider.exists(NAMESPACE, "large_schema_test"));

  // Create a schema with many fields to ensure the underlying store can
  // handle it.
  SchemaBuilder.FieldAssembler<Schema> fields =
      SchemaBuilder.record("Event").fields();
  for (int i = 0; i < 1000; ++i) {
    fields.requiredString("field_" + i);
  }

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .format(Formats.AVRO)
      .schema(fields.endRecord())
      .build();

  DatasetDescriptor created = provider.create(NAMESPACE,
      "large_schema_test", descriptor);

  Assert.assertEquals("Large schemas should match",
      descriptor.getSchema(), created.getSchema());
}
Example #28
Source File: TestExternalBackwardCompatibility.java From kite with Apache License 2.0
@Test
public void testCreateFailsIfNotCompatible() {
  // this will fail because the new descriptor uses a different format;
  // the old descriptor is found and used to validate the change
  TestHelpers.assertThrows("Create should fail because of a format change",
      ValidationException.class, new Runnable() {
        @Override
        public void run() {
          Datasets.create("dataset:hive:/tmp/datasets/test",
              new DatasetDescriptor.Builder(descriptor)
                  .format(Formats.PARQUET)
                  .build());
        }
      });
}
Example #29
Source File: TestExternalBackwardCompatibility.java From kite with Apache License 2.0
@Test
public void testCreateIncompatibleSucceedsWithLocation() {
  // if there is a requested location then the default table isn't checked,
  // because only the default location would have been used
  Assert.assertNotNull("Create should succeed if location doesn't match",
      Datasets.create("dataset:hive:/tmp/datasets/test",
          new DatasetDescriptor.Builder(descriptor)
              .location(URI.create("file:/tmp/test-data/test"))
              .format(Formats.PARQUET)
              .build()));
}
Example #30
Source File: TestExternalBackwardCompatibility.java From kite with Apache License 2.0
@Test
public void testUpdateValidatesAgainstDefaultNamespace() {
  TestHelpers.assertThrows("Update should fail because of a format change",
      ValidationException.class, new Runnable() {
        @Override
        public void run() {
          Datasets.update("dataset:hive:/tmp/datasets/test",
              new DatasetDescriptor.Builder(descriptor)
                  .format(Formats.PARQUET)
                  .build());
        }
      });
}