org.apache.crunch.Pipeline#done

Source File: JoinFilterExampleCrunch.java From hadoop-arch-book with Apache License 2.0

5 votes

public int run(String[] args) throws Exception {

    String fooInputPath = args[0];
    String barInputPath = args[1];
    String outputPath = args[2];
    int fooValMax = Integer.parseInt(args[3]);
    int joinValMax = Integer.parseInt(args[4]);
    int numberOfReducers = Integer.parseInt(args[5]);

    Pipeline pipeline = new MRPipeline(JoinFilterExampleCrunch.class, getConf()); //<1>
    
    PCollection<String> fooLines = pipeline.readTextFile(fooInputPath);  //<2>
    PCollection<String> barLines = pipeline.readTextFile(barInputPath);

    PTable<Long, Pair<Long, Integer>> fooTable = fooLines.parallelDo(  //<3>
        new FooIndicatorFn(),
        Avros.tableOf(Avros.longs(),
        Avros.pairs(Avros.longs(), Avros.ints())));

    fooTable = fooTable.filter(new FooFilter(fooValMax));  //<4>

    PTable<Long, Integer> barTable = barLines.parallelDo(new BarIndicatorFn(),
        Avros.tableOf(Avros.longs(), Avros.ints()));

    DefaultJoinStrategy<Long, Pair<Long, Integer>, Integer> joinStrategy =   //<5>
        new DefaultJoinStrategy
          <Long, Pair<Long, Integer>, Integer>
          (numberOfReducers);

    PTable<Long, Pair<Pair<Long, Integer>, Integer>> joinedTable = joinStrategy //<6>
        .join(fooTable, barTable, JoinType.INNER_JOIN);

    PTable<Long, Pair<Pair<Long, Integer>, Integer>> filteredTable = joinedTable.filter(new JoinFilter(joinValMax));

    filteredTable.write(At.textFile(outputPath), WriteMode.OVERWRITE); //<7>

    PipelineResult result = pipeline.done();

    return result.succeeded() ? 0 : 1;
  }

Source File: WordCount.java From tutorials with MIT License

5 votes

public int run(String[] args) throws Exception {

        if (args.length != 2) {
            System.err.println("Usage: hadoop jar crunch-1.0.0-SNAPSHOT-job.jar" + " [generic options] input output");
            System.err.println();
            GenericOptionsParser.printGenericCommandUsage(System.err);
            return 1;
        }

        String inputPath = args[0];
        String outputPath = args[1];

        // Create an object to coordinate pipeline creation and execution.
        Pipeline pipeline = new MRPipeline(WordCount.class, getConf());

        // Reference a given text file as a collection of Strings.
        PCollection<String> lines = pipeline.readTextFile(inputPath);

        // Define a function that splits each line in a PCollection of Strings into
        // a PCollection made up of the individual words in the file.
        // The second argument sets the serialization format.
        PCollection<String> words = lines.parallelDo(new Tokenizer(), Writables.strings());

        // Take the collection of words and remove known stop words.
        PCollection<String> noStopWords = words.filter(new StopWordFilter());

        // The count method applies a series of Crunch primitives and returns
        // a map of the unique words in the input PCollection to their counts.
        PTable<String, Long> counts = noStopWords.count();

        // Instruct the pipeline to write the resulting counts to a text file.
        pipeline.writeTextFile(counts, outputPath);

        // Execute the pipeline as a MapReduce.
        PipelineResult result = pipeline.done();

        return result.succeeded() ? 0 : 1;
    }

Source File: LegacyHdfs2Cass.java From hdfs2cass with Apache License 2.0

5 votes

@Override
public int run(String[] args) throws Exception {

  new JCommander(this, args);

  URI outputUri = URI.create(output);

  // Our crunch job is a MapReduce job
  Pipeline pipeline = new MRPipeline(LegacyHdfs2Cass.class, getConf());

  // Parse & fetch info about target Cassandra cluster
  CassandraParams params = CassandraParams.parse(outputUri);

  // Read records from Avro files in inputFolder
  PCollection<ByteBuffer> records =
      pipeline.read(From.avroFile(inputList(input), Avros.records(ByteBuffer.class)));

  // Transform the input
  String protocol = outputUri.getScheme();
  if (protocol.equalsIgnoreCase("thrift")) {
    records
        // First convert ByteBuffers to ThriftRecords
        .parallelDo(new LegacyHdfsToThrift(), ThriftRecord.PTYPE)
        // Then group the ThriftRecords in preparation for writing them
        .parallelDo(new ThriftRecord.AsPair(), ThriftRecord.AsPair.PTYPE)
        .groupByKey(params.createGroupingOptions())
        // Finally write the ThriftRecords to Cassandra
        .write(new ThriftTarget(outputUri, params));
  }
  else if (protocol.equalsIgnoreCase("cql")) {
    records
        // In case of CQL, convert ByteBuffers to CQLRecords
        .parallelDo(new LegacyHdfsToCQL(), CQLRecord.PTYPE)
        .by(params.getKeyFn(), Avros.bytes())
        .groupByKey(params.createGroupingOptions())
        .write(new CQLTarget(outputUri, params));
  }

  // Execute the pipeline
  PipelineResult result = pipeline.done();
  return result.succeeded() ? 0 : 1;
}

Source File: TestCrunchDatasets.java From kite with Apache License 2.0

5 votes

private void runCheckpointPipeline(View<Record> inputView,
    View<Record> outputView) {
  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputView));
  pipeline.write(data, CrunchDatasets.asTarget(outputView),
      Target.WriteMode.CHECKPOINT);
  pipeline.done();
}

Source File: Hdfs2Cass.java From hdfs2cass with Apache License 2.0

4 votes

@Override
public int run(String[] args) throws Exception {

  new JCommander(this, args);

  URI outputUri = URI.create(output);

  // Our crunch job is a MapReduce job
  Configuration conf = getConf();
  conf.setBoolean(MRJobConfig.MAP_SPECULATIVE, Boolean.FALSE);
  conf.setBoolean(MRJobConfig.REDUCE_SPECULATIVE, Boolean.FALSE);
  Pipeline pipeline = new MRPipeline(Hdfs2Cass.class, conf);

  // Parse & fetch info about target Cassandra cluster
  CassandraParams params = CassandraParams.parse(outputUri);

  PCollection<GenericRecord> records =
      ((PCollection<GenericRecord>)(PCollection) pipeline.read(From.avroFile(inputList(input))));

  String protocol = outputUri.getScheme();
  if (protocol.equalsIgnoreCase("thrift")) {
    records
        // First convert ByteBuffers to ThriftRecords
        .parallelDo(new AvroToThrift(rowkey, timestamp, ttl, ignore), ThriftRecord.PTYPE)
        // Then group the ThriftRecords in preparation for writing them
        .parallelDo(new ThriftRecord.AsPair(), ThriftRecord.AsPair.PTYPE)
        .groupByKey(params.createGroupingOptions())
         // Finally write the ThriftRecords to Cassandra
        .write(new ThriftTarget(outputUri, params));
  }
  else if (protocol.equalsIgnoreCase("cql")) {
    records
        // In case of CQL, convert ByteBuffers to CQLRecords
        .parallelDo(new AvroToCQL(rowkey, timestamp, ttl, ignore), CQLRecord.PTYPE)
        .by(params.getKeyFn(), Avros.bytes())
        .groupByKey(params.createGroupingOptions())
        .write(new CQLTarget(outputUri, params));
  }

  // Execute the pipeline
  PipelineResult result = pipeline.done();
  return result.succeeded() ? 0 : 1;
}

Source File: TransformTask.java From kite with Apache License 2.0

4 votes

public PipelineResult run() throws IOException {
  boolean isLocal = (isLocal(from.getDataset()) || isLocal(to.getDataset()));
  if (isLocal) {
    // copy to avoid making changes to the caller's configuration
    Configuration conf = new Configuration(getConf());
    conf.set("mapreduce.framework.name", "local");
    setConf(conf);
  }

  if (isHive(from) || isHive(to)) {
    setConf(addHiveDelegationToken(getConf()));

    // add jars needed for metastore interaction to the classpath
    if (!isLocal) {
      Class<?> fb303Class, thriftClass;
      try {
        // attempt to use libfb303 and libthrift 0.9.2 when async was added
        fb303Class = Class.forName(
            "com.facebook.fb303.FacebookService.AsyncProcessor");
        thriftClass = Class.forName(
            "org.apache.thrift.TBaseAsyncProcessor");
      } catch (ClassNotFoundException e) {
        try {
          // fallback to 0.9.0 or earlier
          fb303Class = Class.forName(
              "com.facebook.fb303.FacebookBase");
          thriftClass = Class.forName(
              "org.apache.thrift.TBase");
        } catch (ClassNotFoundException real) {
          throw new DatasetOperationException(
              "Cannot find thrift dependencies", real);
        }
      }

      TaskUtil.configure(getConf())
          .addJarForClass(Encoder.class) // commons-codec
          .addJarForClass(Log.class) // commons-logging
          .addJarForClass(CompressorInputStream.class) // commons-compress
          .addJarForClass(ApiAdapter.class) // datanucleus-core
          .addJarForClass(JDOAdapter.class) // datanucleus-api-jdo
          .addJarForClass(SQLQuery.class) // datanucleus-rdbms
          .addJarForClass(JDOHelper.class) // jdo-api
          .addJarForClass(Transaction.class) // jta
          .addJarForClass(fb303Class) // libfb303
          .addJarForClass(thriftClass) // libthrift
          .addJarForClass(HiveMetaStore.class) // hive-metastore
          .addJarForClass(HiveConf.class); // hive-exec
    }
  }

  PType<T> toPType = ptype(to);
  MapFn<T, T> validate = new CheckEntityClass<T>(to.getType());

  Pipeline pipeline = new MRPipeline(getClass(), getConf());

  PCollection<T> collection = pipeline.read(CrunchDatasets.asSource(from))
      .parallelDo(transform, toPType).parallelDo(validate, toPType);

  if (compact) {
    // the transform must be run before partitioning
    collection = CrunchDatasets.partition(collection, to, numWriters, numPartitionWriters);
  }

  pipeline.write(collection, CrunchDatasets.asTarget(to), mode);

  PipelineResult result = pipeline.done();

  StageResult sr = Iterables.getFirst(result.getStageResults(), null);
  if (sr != null && MAP_INPUT_RECORDS != null) {
    this.count = sr.getCounterValue(MAP_INPUT_RECORDS);
  }

  return result;
}

Java Code Examples for org.apache.crunch.Pipeline#done()