org.apache.crunch.DoFn Java Examples
The following examples show how to use org.apache.crunch.DoFn.
You can go to the original project or source file by following the links above each example.
Example #1
Source File: TransformTask.java, from kite (Apache License 2.0)
public TransformTask(View<S> from, View<T> to, DoFn<S, T> transform) {
  this.from = from;
  this.to = to;
  this.transform = transform;
}
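The DoFn handed to TransformTask is an ordinary Crunch function: its process method receives one input and may emit zero or more outputs through an Emitter. The following is a minimal sketch of such a function; UpperCaseFn is a hypothetical example for illustration, not part of kite or Crunch.

import org.apache.crunch.DoFn;
import org.apache.crunch.Emitter;

// Hypothetical DoFn: emits the upper-cased form of each input string.
public class UpperCaseFn extends DoFn<String, String> {
  @Override
  public void process(String input, Emitter<String> emitter) {
    // A DoFn may emit zero or more outputs per input; here, exactly one.
    emitter.emit(input.toUpperCase());
  }
}

In a Crunch pipeline, a function like this would typically be applied with PCollection.parallelDo(fn, ptype); here, TransformTask applies it to every record read from the source view.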
Example #2
Source File: JSONImportCommand.java, from kite (Apache License 2.0)
@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
  Preconditions.checkArgument(targets != null && targets.size() == 2,
      "JSON path and target dataset name are required.");

  Path source = qualifiedPath(targets.get(0));
  FileSystem sourceFS = source.getFileSystem(getConf());
  Preconditions.checkArgument(sourceFS.exists(source),
      "JSON path does not exist: " + source);

  String dataset = targets.get(1);

  View<Record> target = load(dataset, Record.class);
  Schema datasetSchema = target.getDataset().getDescriptor().getSchema();

  // Describe the source JSON as a dataset that reuses the target's schema,
  // minus any embedded partition strategy or column mapping.
  DatasetDescriptor jsonDescriptor = new DatasetDescriptor.Builder()
      .location(source.toUri())
      .schema(ColumnMappingParser.removeEmbeddedMapping(
          PartitionStrategyParser.removeEmbeddedStrategy(datasetSchema)))
      .format("json")
      .build();

  TemporaryFileSystemDatasetRepository repo =
      new TemporaryFileSystemDatasetRepository(getConf(),
          // ensure the same FS as the file source is used
          sourceFS.makeQualified(new Path("/tmp/" + UUID.randomUUID().toString())),
          target.getDataset().getNamespace(),
          UUID.randomUUID().toString());

  try {
    FileSystemDataset<Record> jsonDataset =
        (FileSystemDataset) repo.create("import", "json", jsonDescriptor);

    Iterator<Path> iter = jsonDataset.pathIterator().iterator();
    Preconditions.checkArgument(iter.hasNext(),
        "JSON path has no data files: " + source);

    TaskUtil.configure(getConf()).addJars(jars);

    TransformTask task;
    if (transform != null) {
      DoFn<Record, Record> transformFn;
      try {
        // Load the user-supplied DoFn class reflectively; it must have a
        // no-arg constructor.
        DynConstructors.Ctor<DoFn<Record, Record>> ctor =
            new DynConstructors.Builder(DoFn.class)
                .loader(loaderForJars(jars))
                .impl(transform)
                .buildChecked();
        transformFn = ctor.newInstance();
      } catch (NoSuchMethodException e) {
        throw new DatasetException(
            "Cannot find no-arg constructor for class: " + transform, e);
      }
      task = new TransformTask<Record, Record>(jsonDataset, target, transformFn);
    } else {
      task = new CopyTask<Record>(jsonDataset, target);
    }

    task.setConf(getConf());

    if (noCompaction) {
      task.noCompaction();
    }
    if (numWriters >= 0) {
      task.setNumWriters(numWriters);
    }
    if (filesPerPartition > 0) {
      task.setFilesPerPartition(filesPerPartition);
    }
    if (overwrite) {
      task.setWriteMode(Target.WriteMode.OVERWRITE);
    }

    PipelineResult result = task.run();

    if (result.succeeded()) {
      long count = task.getCount();
      if (count > 0) {
        console.info("Added {} records to \"{}\"", count, dataset);
      }
      return 0;
    } else {
      return 1;
    }
  } finally {
    // clean up the temporary repository
    repo.delete();
  }
}
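The transform class named on the command line must extend DoFn<Record, Record> and expose a no-arg constructor, since it is instantiated reflectively via DynConstructors. Below is a minimal sketch of such a transform, assuming Record is Avro's GenericData.Record as in the command above; the class name DropNullIdFn and the "id" field are hypothetical, chosen only for illustration.

import org.apache.avro.generic.GenericData.Record;
import org.apache.crunch.DoFn;
import org.apache.crunch.Emitter;

// Hypothetical transform: drops records whose "id" field is null.
// The implicit public no-arg constructor satisfies buildChecked() above.
public class DropNullIdFn extends DoFn<Record, Record> {
  @Override
  public void process(Record record, Emitter<Record> emitter) {
    if (record.get("id") != null) {
      emitter.emit(record);
    }
  }
}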
Example #3
Source File: TransformCommand.java, from kite (Apache License 2.0)
@Override
public int run() throws IOException {
  Preconditions.checkArgument(datasets != null && datasets.size() > 1,
      "Source and target datasets are required");
  Preconditions.checkArgument(datasets.size() == 2,
      "Cannot copy multiple datasets");

  View<Record> source = load(datasets.get(0), Record.class);
  View<Record> dest = load(datasets.get(1), Record.class);

  TaskUtil.configure(getConf()).addJars(jars);

  TransformTask task;
  if (transform != null) {
    DoFn<Record, Record> transformFn;
    try {
      // Same reflective loading as in JSONImportCommand: the named class
      // must have a no-arg constructor.
      DynConstructors.Ctor<DoFn<Record, Record>> ctor =
          new DynConstructors.Builder(DoFn.class)
              .loader(loaderForJars(jars))
              .impl(transform)
              .buildChecked();
      transformFn = ctor.newInstance();
    } catch (NoSuchMethodException e) {
      throw new DatasetException(
          "Cannot find no-arg constructor for class: " + transform, e);
    }
    task = new TransformTask<Record, Record>(source, dest, transformFn);
  } else {
    // Without a transform, fall back to a straight copy.
    task = new CopyTask<Record>(source, dest);
  }

  task.setConf(getConf());

  if (noCompaction) {
    task.noCompaction();
  }
  if (numWriters >= 0) {
    task.setNumWriters(numWriters);
  }
  if (filesPerPartition > 0) {
    task.setFilesPerPartition(filesPerPartition);
  }
  if (overwrite) {
    task.setWriteMode(Target.WriteMode.OVERWRITE);
  }

  PipelineResult result = task.run();

  if (result.succeeded()) {
    console.info("Added {} records to \"{}\"", task.getCount(), datasets.get(1));
    return 0;
  } else {
    return 1;
  }
}
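Both commands share the same loading pattern: buildChecked() fails with NoSuchMethodException when the named class lacks a no-arg constructor, which is surfaced as a DatasetException. The sketch below approximates that behavior with plain Java reflection; it is an assumption about what the DynConstructors helper does, not kite's actual implementation, and TransformLoader is a hypothetical name.

import org.apache.avro.generic.GenericData.Record;
import org.apache.crunch.DoFn;

public class TransformLoader {
  // Approximates the DynConstructors lookup with plain reflection.
  @SuppressWarnings("unchecked")
  static DoFn<Record, Record> loadTransform(String className, ClassLoader loader)
      throws ReflectiveOperationException {
    Class<?> clazz = Class.forName(className, true, loader);
    // getConstructor() throws NoSuchMethodException if there is no public
    // no-arg constructor, mirroring the DatasetException path above.
    return (DoFn<Record, Record>) clazz.getConstructor().newInstance();
  }
}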