org.apache.crunch.PipelineResult#succeeded

Source File: JoinFilterExampleCrunch.java From hadoop-arch-book with Apache License 2.0

5 votes

/**
 * Builds and runs a Crunch MapReduce pipeline that reads two text inputs, filters the "foo"
 * table, inner-joins it with the "bar" table, filters the joined rows and writes them as text.
 *
 * @param args six positional arguments, in order: foo input path, bar input path, output path,
 *     fooValMax, joinValMax and numberOfReducers (the last three are parsed as integers)
 * @return 0 when the pipeline succeeded; 1 when it failed or fewer than six arguments were given
 * @throws Exception if a numeric argument cannot be parsed or the pipeline fails to build or run
 */
public int run(String[] args) throws Exception {

    // Reject a short argument list up front rather than failing below with an
    // ArrayIndexOutOfBoundsException that says nothing about the expected arguments.
    if (args == null || args.length < 6) {
        System.err.println("Usage: JoinFilterExampleCrunch <fooInputPath> <barInputPath>"
            + " <outputPath> <fooValMax> <joinValMax> <numberOfReducers>");
        return 1;
    }

    String fooInputPath = args[0];
    String barInputPath = args[1];
    String outputPath = args[2];
    int fooValMax = Integer.parseInt(args[3]);
    int joinValMax = Integer.parseInt(args[4]);
    int numberOfReducers = Integer.parseInt(args[5]);

    Pipeline pipeline = new MRPipeline(JoinFilterExampleCrunch.class, getConf()); //<1>

    PCollection<String> fooLines = pipeline.readTextFile(fooInputPath);  //<2>
    PCollection<String> barLines = pipeline.readTextFile(barInputPath);

    // Turn each foo line into a (Long key, (Long, Integer)) table entry.
    PTable<Long, Pair<Long, Integer>> fooTable = fooLines.parallelDo(  //<3>
        new FooIndicatorFn(),
        Avros.tableOf(Avros.longs(),
        Avros.pairs(Avros.longs(), Avros.ints())));

    // Filter before the join so fewer foo rows take part in it.
    fooTable = fooTable.filter(new FooFilter(fooValMax));  //<4>

    // Turn each bar line into a (Long key, Integer) table entry.
    PTable<Long, Integer> barTable = barLines.parallelDo(new BarIndicatorFn(),
        Avros.tableOf(Avros.longs(), Avros.ints()));

    DefaultJoinStrategy<Long, Pair<Long, Integer>, Integer> joinStrategy =   //<5>
        new DefaultJoinStrategy
          <Long, Pair<Long, Integer>, Integer>
          (numberOfReducers);

    PTable<Long, Pair<Pair<Long, Integer>, Integer>> joinedTable = joinStrategy //<6>
        .join(fooTable, barTable, JoinType.INNER_JOIN);

    PTable<Long, Pair<Pair<Long, Integer>, Integer>> filteredTable =
        joinedTable.filter(new JoinFilter(joinValMax));

    // OVERWRITE write mode: existing output at outputPath is replaced.
    filteredTable.write(At.textFile(outputPath), WriteMode.OVERWRITE); //<7>

    // done() runs the pipeline; its result is mapped to a process exit code.
    PipelineResult result = pipeline.done();

    return result.succeeded() ? 0 : 1;
}

Source File: StagingToPersistent.java From kite-examples with Apache License 2.0

5 votes

/**
 * Appends the staged log records selected by {@code toBefore("timestamp", startOfDay())} to the
 * persistent logs dataset and, if the pipeline succeeds, deletes those records from staging.
 *
 * @param args command-line arguments; not read by this method
 * @return 0 if the pipeline succeeded, 1 otherwise
 * @throws Exception if loading a dataset or running the pipeline fails
 */
@Override
public int run(String[] args) throws Exception {
  final long todayStart = startOfDay();

  // Destination dataset that receives the staged records.
  Dataset<Record> persistentLogs = Datasets.load(
      "dataset:file:/tmp/data/logs", Record.class);

  // Source dataset, narrowed to the view returned by toBefore("timestamp", todayStart).
  Dataset<Record> stagingLogs = Datasets.load(
      "dataset:file:/tmp/data/logs_staging", Record.class);
  View<Record> readyView = stagingLogs.toBefore("timestamp", todayStart);

  ReadableSource<Record> stagedSource = CrunchDatasets.asSource(readyView);
  PCollection<Record> stagedRecords = read(stagedSource);

  getPipeline().write(
      stagedRecords, CrunchDatasets.asTarget(persistentLogs), Target.WriteMode.APPEND);

  PipelineResult outcome = run();
  if (!outcome.succeeded()) {
    // Leave staging untouched so the same records can be retried.
    return 1;
  }

  // The records were appended, so drop them from the staging area.
  readyView.deleteAll();
  return 0;
}

Source File: WordCount.java From tutorials with MIT License

5 votes

/**
 * Counts the words of a text file, excluding those rejected by {@code StopWordFilter}, and
 * writes the counts to a text file using a Crunch MapReduce pipeline.
 *
 * @param args exactly two arguments: the input path and the output path
 * @return 0 if the pipeline succeeded; 1 if it failed or the argument count was wrong
 * @throws Exception if the pipeline cannot be built or run
 */
public int run(String[] args) throws Exception {

    // Anything other than "input output" gets the usage text and a failure code.
    if (args.length != 2) {
        System.err.println(
            "Usage: hadoop jar crunch-1.0.0-SNAPSHOT-job.jar" + " [generic options] input output");
        System.err.println();
        GenericOptionsParser.printGenericCommandUsage(System.err);
        return 1;
    }

    // The pipeline object coordinates plan construction and execution.
    Pipeline wordCountPipeline = new MRPipeline(WordCount.class, getConf());

    // lines -> words (serialized as Writable strings) -> words passing StopWordFilter -> counts
    PTable<String, Long> wordCounts = wordCountPipeline
        .readTextFile(args[0])
        .parallelDo(new Tokenizer(), Writables.strings())
        .filter(new StopWordFilter())
        .count();

    // Register the text output for the counts.
    wordCountPipeline.writeTextFile(wordCounts, args[1]);

    // done() executes the pipeline; translate its outcome into an exit code.
    if (wordCountPipeline.done().succeeded()) {
        return 0;
    }
    return 1;
}

Source File: LegacyHdfs2Cass.java From hdfs2cass with Apache License 2.0

5 votes

/**
 * Reads {@link ByteBuffer} records from Avro files and writes them to the target described by
 * the {@code output} URI, using the Thrift or CQL path depending on the URI scheme.
 *
 * @param args command-line arguments handed to JCommander together with this object
 * @return 0 if the pipeline succeeded, 1 otherwise
 * @throws IllegalArgumentException if the output URI scheme is neither {@code thrift} nor
 *     {@code cql} (including a URI without a scheme)
 * @throws Exception if argument parsing, pipeline construction or execution fails
 */
@Override
public int run(String[] args) throws Exception {

  // NOTE(review): presumably this populates the `input` and `output` fields read below — confirm.
  new JCommander(this, args);

  URI outputUri = URI.create(output);

  // Our crunch job is a MapReduce job
  Pipeline pipeline = new MRPipeline(LegacyHdfs2Cass.class, getConf());

  // Parse & fetch info about target Cassandra cluster
  CassandraParams params = CassandraParams.parse(outputUri);

  // Read records from the Avro files listed by inputList(input)
  PCollection<ByteBuffer> records =
      pipeline.read(From.avroFile(inputList(input), Avros.records(ByteBuffer.class)));

  // Transform the input. getScheme() returns null for a scheme-less URI, so the literal is
  // the receiver of equalsIgnoreCase to avoid a NullPointerException.
  String protocol = outputUri.getScheme();
  if ("thrift".equalsIgnoreCase(protocol)) {
    records
        // First convert ByteBuffers to ThriftRecords
        .parallelDo(new LegacyHdfsToThrift(), ThriftRecord.PTYPE)
        // Then group the ThriftRecords in preparation for writing them
        .parallelDo(new ThriftRecord.AsPair(), ThriftRecord.AsPair.PTYPE)
        .groupByKey(params.createGroupingOptions())
        // Finally write the ThriftRecords to Cassandra
        .write(new ThriftTarget(outputUri, params));
  } else if ("cql".equalsIgnoreCase(protocol)) {
    records
        // In case of CQL, convert ByteBuffers to CQLRecords
        .parallelDo(new LegacyHdfsToCQL(), CQLRecord.PTYPE)
        .by(params.getKeyFn(), Avros.bytes())
        .groupByKey(params.createGroupingOptions())
        .write(new CQLTarget(outputUri, params));
  } else {
    // Without this branch no output is attached and the run proceeds without writing anything.
    throw new IllegalArgumentException(
        "Unsupported output URI scheme '" + protocol + "' in " + outputUri
            + "; expected thrift or cql");
  }

  // Execute the pipeline
  PipelineResult result = pipeline.done();
  return result.succeeded() ? 0 : 1;
}

Source File: CompactCommand.java From kite with Apache License 2.0

5 votes

/**
 * Compacts the single dataset or view named on the command line by running a
 * {@code CompactionTask} over it.
 *
 * @return 0 if the compaction pipeline succeeded, 1 otherwise
 * @throws IllegalArgumentException if no dataset or more than one dataset was given, or if a
 *     dataset/view URI resolved to a view that does not match the requested one
 * @throws IOException declared by the method signature; no statement here throws it directly
 */
@Override
public int run() throws IOException {
  // Guard against a missing argument list first: calling size() on null would throw a
  // NullPointerException, and an empty list would be reported as "multiple datasets".
  Preconditions.checkArgument(datasets != null && !datasets.isEmpty(),
      "A dataset to compact is required");
  Preconditions.checkArgument(datasets.size() == 1,
      "Cannot compact multiple datasets");

  String uriOrName = datasets.get(0);
  View<Record> view = load(uriOrName, Record.class);

  // When the argument is a dataset/view URI, make sure the loaded view is the one asked for.
  if (isDatasetOrViewUri(uriOrName)) {
    Preconditions.checkArgument(viewMatches(view.getUri(), uriOrName),
        "Resolved view does not match requested view: " + view.getUri());
  }

  // Parameterized (not raw) so the task keeps the Record element type it is built with.
  CompactionTask<Record> task = new CompactionTask<Record>(view);

  task.setConf(getConf());

  // Only override the task's own settings when the option holds a usable value.
  if (numWriters >= 0) {
    task.setNumWriters(numWriters);
  }

  if (filesPerPartition > 0) {
    task.setFilesPerPartition(filesPerPartition);
  }

  PipelineResult result = task.run();

  if (result.succeeded()) {
    console.info("Compacted {} records in \"{}\"",
        task.getCount(), uriOrName);
    return 0;
  } else {
    return 1;
  }
}

Source File: Hdfs2Cass.java From hdfs2cass with Apache License 2.0

4 votes

/**
 * Reads Avro {@code GenericRecord}s from the configured input and writes them to the target
 * described by the {@code output} URI, using the Thrift or CQL path depending on the URI scheme.
 *
 * @param args command-line arguments handed to JCommander together with this object
 * @return 0 if the pipeline succeeded, 1 otherwise
 * @throws IllegalArgumentException if the output URI scheme is neither {@code thrift} nor
 *     {@code cql} (including a URI without a scheme)
 * @throws Exception if argument parsing, pipeline construction or execution fails
 */
@Override
public int run(String[] args) throws Exception {

  // NOTE(review): presumably this populates the option fields (input, output, rowkey,
  // timestamp, ttl, ignore) read below — confirm.
  new JCommander(this, args);

  URI outputUri = URI.create(output);

  // Our crunch job is a MapReduce job; speculative map and reduce execution is switched off.
  Configuration conf = getConf();
  conf.setBoolean(MRJobConfig.MAP_SPECULATIVE, false);
  conf.setBoolean(MRJobConfig.REDUCE_SPECULATIVE, false);
  Pipeline pipeline = new MRPipeline(Hdfs2Cass.class, conf);

  // Parse & fetch info about target Cassandra cluster
  CassandraParams params = CassandraParams.parse(outputUri);

  // The raw intermediate cast re-types the collection returned by read() as
  // PCollection<GenericRecord>; it is unchecked.
  PCollection<GenericRecord> records =
      ((PCollection<GenericRecord>)(PCollection) pipeline.read(From.avroFile(inputList(input))));

  // getScheme() returns null for a scheme-less URI, so the literal is the receiver of
  // equalsIgnoreCase to avoid a NullPointerException.
  String protocol = outputUri.getScheme();
  if ("thrift".equalsIgnoreCase(protocol)) {
    records
        // First convert the Avro records to ThriftRecords
        .parallelDo(new AvroToThrift(rowkey, timestamp, ttl, ignore), ThriftRecord.PTYPE)
        // Then group the ThriftRecords in preparation for writing them
        .parallelDo(new ThriftRecord.AsPair(), ThriftRecord.AsPair.PTYPE)
        .groupByKey(params.createGroupingOptions())
        // Finally write the ThriftRecords to Cassandra
        .write(new ThriftTarget(outputUri, params));
  } else if ("cql".equalsIgnoreCase(protocol)) {
    records
        // In case of CQL, convert the Avro records to CQLRecords
        .parallelDo(new AvroToCQL(rowkey, timestamp, ttl, ignore), CQLRecord.PTYPE)
        .by(params.getKeyFn(), Avros.bytes())
        .groupByKey(params.createGroupingOptions())
        .write(new CQLTarget(outputUri, params));
  } else {
    // Without this branch no output is attached and the run proceeds without writing anything.
    throw new IllegalArgumentException(
        "Unsupported output URI scheme '" + protocol + "' in " + outputUri
            + "; expected thrift or cql");
  }

  // Execute the pipeline
  PipelineResult result = pipeline.done();
  return result.succeeded() ? 0 : 1;
}

Source File: CopyCommand.java From kite with Apache License 2.0

4 votes

/**
 * Copies the records of the first named dataset into the second by running a {@code CopyTask}.
 * The source view is read with the destination's schema.
 *
 * @return 0 if the copy pipeline succeeded, 1 otherwise
 * @throws IllegalArgumentException unless exactly two datasets (source, then target) were given
 * @throws IOException declared by the method signature; no statement here throws it directly
 */
@Override
public int run() throws IOException {
  Preconditions.checkArgument(datasets != null && datasets.size() > 1,
      "Source and target datasets are required");
  Preconditions.checkArgument(datasets.size() == 2,
      "Cannot copy multiple datasets");

  // The destination is loaded first because the source is projected onto its schema.
  View<GenericRecord> dest = load(datasets.get(1));
  View<GenericRecord> source = load(datasets.get(0))
      .asSchema(dest.getSchema());

  // Parameterized (not raw) so the task keeps the GenericRecord element type it is built with.
  CopyTask<GenericRecord> task = new CopyTask<GenericRecord>(source, dest);

  task.setConf(getConf());

  // Each option below overrides the task's own setting only when it was supplied.
  if (noCompaction) {
    task.noCompaction();
  }

  if (numWriters >= 0) {
    task.setNumWriters(numWriters);
  }

  if (filesPerPartition > 0) {
    task.setFilesPerPartition(filesPerPartition);
  }

  if (overwrite) {
    task.setWriteMode(Target.WriteMode.OVERWRITE);
  }

  PipelineResult result = task.run();

  if (result.succeeded()) {
    console.info("Added {} records to \"{}\"",
        task.getCount(), datasets.get(1));
    return 0;
  } else {
    return 1;
  }
}

Source File: JSONImportCommand.java From kite with Apache License 2.0

4 votes

/**
 * Imports the JSON data at the first target path into the dataset named by the second target.
 * The JSON files are exposed through a temporary dataset repository, then copied into the
 * target dataset, or passed through the configured {@code transform} DoFn when one is set.
 * The temporary repository is deleted whether or not the import succeeds.
 *
 * @return 0 if the import pipeline succeeded, 1 otherwise
 * @throws IllegalArgumentException unless exactly two targets were given, the JSON path exists
 *     and it contains at least one data file
 * @throws DatasetException if the transform class has no usable no-arg constructor
 * @throws IOException if the source file system cannot be accessed
 */
@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
  Preconditions.checkArgument(targets != null && targets.size() == 2,
      "JSON path and target dataset name are required.");

  Path jsonPath = qualifiedPath(targets.get(0));
  FileSystem jsonFs = jsonPath.getFileSystem(getConf());
  Preconditions.checkArgument(jsonFs.exists(jsonPath),
      "JSON path does not exist: " + jsonPath);

  String datasetName = targets.get(1);

  View<Record> destination = load(datasetName, Record.class);
  Schema destinationSchema = destination.getDataset().getDescriptor().getSchema();

  // Describe the JSON files with the target's schema, after passing it through
  // removeEmbeddedStrategy and then removeEmbeddedMapping.
  DatasetDescriptor jsonDescriptor = new DatasetDescriptor.Builder()
      .location(jsonPath.toUri())
      .schema(ColumnMappingParser.removeEmbeddedMapping(
          PartitionStrategyParser.removeEmbeddedStrategy(destinationSchema)))
      .format("json")
      .build();

  TemporaryFileSystemDatasetRepository tempRepo =
      new TemporaryFileSystemDatasetRepository(getConf(),
          // ensure the same FS as the file source is used
          jsonFs.makeQualified(new Path("/tmp/" + UUID.randomUUID().toString())),
          destination.getDataset().getNamespace(),
          UUID.randomUUID().toString());

  try {
    FileSystemDataset<Record> jsonDataset =
        (FileSystemDataset) tempRepo.create("import", "json", jsonDescriptor);

    // Refuse to start a job when the path holds nothing to import.
    Preconditions.checkArgument(jsonDataset.pathIterator().iterator().hasNext(),
        "JSON path has no data files: " + jsonPath);

    TaskUtil.configure(getConf()).addJars(jars);

    TransformTask task;
    if (transform == null) {
      // No transform requested: a plain copy into the target.
      task = new CopyTask<Record>(jsonDataset, destination);
    } else {
      // Instantiate the user-supplied DoFn through its no-arg constructor.
      DoFn<Record, Record> userFn;
      try {
        DynConstructors.Ctor<DoFn<Record, Record>> fnCtor =
            new DynConstructors.Builder(DoFn.class)
                .loader(loaderForJars(jars))
                .impl(transform)
                .buildChecked();
        userFn = fnCtor.newInstance();
      } catch (NoSuchMethodException e) {
        throw new DatasetException(
            "Cannot find no-arg constructor for class: " + transform, e);
      }
      task = new TransformTask<Record, Record>(
          jsonDataset, destination, userFn);
    }

    task.setConf(getConf());

    // Apply only the options that were actually supplied.
    if (noCompaction) {
      task.noCompaction();
    }
    if (numWriters >= 0) {
      task.setNumWriters(numWriters);
    }
    if (filesPerPartition > 0) {
      task.setFilesPerPartition(filesPerPartition);
    }
    if (overwrite) {
      task.setWriteMode(Target.WriteMode.OVERWRITE);
    }

    PipelineResult outcome = task.run();
    if (!outcome.succeeded()) {
      return 1;
    }

    // Stay quiet when nothing was added.
    long added = task.getCount();
    if (added > 0) {
      console.info("Added {} records to \"{}\"", added, datasetName);
    }
    return 0;
  } finally {
    // clean up the temporary repository
    tempRepo.delete();
  }
}

Source File: TransformCommand.java From kite with Apache License 2.0

4 votes

/**
 * Moves records from the first named dataset to the second, either unchanged through a
 * {@code CopyTask} or through the configured {@code transform} DoFn.
 *
 * @return 0 if the pipeline succeeded, 1 otherwise
 * @throws IllegalArgumentException unless exactly two datasets (source, then target) were given
 * @throws DatasetException if the transform class has no usable no-arg constructor
 * @throws IOException declared by the method signature; no statement here throws it directly
 */
@Override
public int run() throws IOException {
  Preconditions.checkArgument(datasets != null && datasets.size() > 1,
      "Source and target datasets are required");
  Preconditions.checkArgument(datasets.size() == 2,
      "Cannot copy multiple datasets");

  View<Record> from = load(datasets.get(0), Record.class);
  View<Record> to = load(datasets.get(1), Record.class);

  TaskUtil.configure(getConf()).addJars(jars);

  TransformTask task;
  if (transform == null) {
    // No transform requested: a plain copy from source to target.
    task = new CopyTask<Record>(from, to);
  } else {
    // Instantiate the user-supplied DoFn through its no-arg constructor.
    DoFn<Record, Record> userFn;
    try {
      DynConstructors.Ctor<DoFn<Record, Record>> fnCtor =
          new DynConstructors.Builder(DoFn.class)
              .loader(loaderForJars(jars))
              .impl(transform)
              .buildChecked();
      userFn = fnCtor.newInstance();
    } catch (NoSuchMethodException e) {
      throw new DatasetException(
          "Cannot find no-arg constructor for class: " + transform, e);
    }
    task = new TransformTask<Record, Record>(from, to, userFn);
  }

  task.setConf(getConf());

  // Apply only the options that were actually supplied.
  if (noCompaction) {
    task.noCompaction();
  }
  if (numWriters >= 0) {
    task.setNumWriters(numWriters);
  }
  if (filesPerPartition > 0) {
    task.setFilesPerPartition(filesPerPartition);
  }
  if (overwrite) {
    task.setWriteMode(Target.WriteMode.OVERWRITE);
  }

  PipelineResult outcome = task.run();
  if (!outcome.succeeded()) {
    return 1;
  }

  console.info("Added {} records to \"{}\"",
      task.getCount(), datasets.get(1));
  return 0;
}

Java Code Examples for org.apache.crunch.PipelineResult#succeeded()