Java Code Examples for org.apache.flink.api.java.ExecutionEnvironment#createInput()
The following examples show how to use org.apache.flink.api.java.ExecutionEnvironment#createInput().
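As a quick orientation before the project examples, here is a minimal, self-contained sketch of the two createInput() overloads: one takes only an InputFormat and infers the produced type from it, the other additionally takes an explicit TypeInformation for formats whose type cannot be extracted automatically (as in several Hadoop and Pravega examples below). The input path used here is a placeholder and is not taken from any of the projects listed on this page.

import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.io.TextInputFormat;
import org.apache.flink.core.fs.Path;

public class CreateInputSketch {

    public static void main(String[] args) throws Exception {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

        // Overload 1: the element type (String) is extracted from the InputFormat itself.
        DataSet<String> lines = env.createInput(new TextInputFormat(new Path("/tmp/placeholder-input.txt")));

        // Overload 2: pass the TypeInformation explicitly; useful when type extraction
        // is not possible, e.g. for generic or Hadoop-wrapped input formats.
        DataSet<String> typedLines = env.createInput(
                new TextInputFormat(new Path("/tmp/placeholder-input.txt")),
                BasicTypeInfo.STRING_TYPE_INFO);

        lines.print();
        typedLines.print();
    }
}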
Example 1
Source File: AvroTypeExtractionTest.java From flink with Apache License 2.0 | 6 votes |
@Test
public void testKeySelection() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.getConfig().enableObjectReuse();
    Path in = new Path(inFile.getAbsoluteFile().toURI());
    AvroInputFormat<User> users = new AvroInputFormat<>(in, User.class);
    DataSet<User> usersDS = env.createInput(users);

    DataSet<Tuple2<String, Integer>> res = usersDS
        .groupBy("name")
        .reduceGroup((GroupReduceFunction<User, Tuple2<String, Integer>>) (values, out) -> {
            for (User u : values) {
                out.collect(new Tuple2<>(u.getName().toString(), 1));
            }
        })
        .returns(Types.TUPLE(Types.STRING, Types.INT));

    res.writeAsText(resultPath);
    env.execute("Avro Key selection");

    expected = "(Alyssa,1)\n(Charlie,1)\n";
}
Example 2
Source File: TPCDSQuery55Parquet.java From parquet-flinktacular with Apache License 2.0 | 6 votes |
private static DataSet<Tuple2<Void, DateDimTable>> getDataDimDataSet(ExecutionEnvironment env) throws IOException {
    Job job = Job.getInstance();

    // Schema projection
    ParquetInputFormat.setReadSupportClass(job, ThriftReadSupport.class);
    job.getConfiguration().set("parquet.thrift.column.filter", "d_date_sk;d_year;d_moy");

    HadoopInputFormat hadoopInputFormat =
        new HadoopInputFormat(new ParquetThriftInputFormat(), Void.class, DateDimTable.class, job);

    // Filter
    LongColumn moy = longColumn("d_moy");
    LongColumn year = longColumn("d_year");
    FilterPredicate moyPred = eq(moy, 11L);
    FilterPredicate yearPred = eq(year, 1999L);
    FilterPredicate constraint = and(moyPred, yearPred);
    ParquetThriftInputFormat.setFilterPredicate(job.getConfiguration(), constraint);

    ParquetThriftInputFormat.addInputPath(job, new Path(datadimPath));

    DataSet<Tuple2<Void, DateDimTable>> data = env.createInput(hadoopInputFormat);

    return data;
}
Example 3
Source File: ParquetThriftExample.java From parquet-flinktacular with Apache License 2.0 | 6 votes |
public static DataSet<Tuple2<Void, Person>> readThrift(ExecutionEnvironment env, String inputPath) throws IOException {
    Job job = Job.getInstance();
    HadoopInputFormat hadoopInputFormat =
        new HadoopInputFormat(new ParquetThriftInputFormat(), Void.class, Person.class, job);

    // schema projection: don't read attributes id and email
    job.getConfiguration().set("parquet.thrift.column.filter", "name;id;email;phone/number");

    FileInputFormat.addInputPath(job, new Path(inputPath));

    // push down predicates: get all persons with name = "Felix"
    BinaryColumn name = binaryColumn("name");
    FilterPredicate namePred = eq(name, Binary.fromString("Felix"));
    ParquetInputFormat.setFilterPredicate(job.getConfiguration(), namePred);

    DataSet<Tuple2<Void, Person>> data = env.createInput(hadoopInputFormat);

    return data;
}
Example 4
Source File: AvroTypeExtractionTest.java From flink with Apache License 2.0 | 6 votes |
@Test
public void testWithKryoGenericSer() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.getConfig().enableForceKryo();
    Path in = new Path(inFile.getAbsoluteFile().toURI());
    AvroInputFormat<User> users = new AvroInputFormat<>(in, User.class);
    DataSet<User> usersDS = env.createInput(users);

    DataSet<Tuple2<String, Integer>> res = usersDS
        .groupBy((KeySelector<User, String>) value -> String.valueOf(value.getName()))
        .reduceGroup((GroupReduceFunction<User, Tuple2<String, Integer>>) (values, out) -> {
            for (User u : values) {
                out.collect(new Tuple2<>(u.getName().toString(), 1));
            }
        })
        .returns(Types.TUPLE(Types.STRING, Types.INT));

    res.writeAsText(resultPath);
    env.execute("Avro Key selection");

    expected = "(Charlie,1)\n(Alyssa,1)\n";
}
Example 5
Source File: ReplicatingDataSourceTest.java From Flink-CEPplus with Apache License 2.0 | 6 votes |
/**
 * Tests compiler fail for join program with replicated data source behind rebalance.
 */
@Test(expected = CompilerException.class)
public void checkJoinWithReplicatedSourceInputBehindRebalance() {
    ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment();
    env.setParallelism(DEFAULT_PARALLELISM);

    TupleTypeInfo<Tuple1<String>> typeInfo = TupleTypeInfo.getBasicTupleTypeInfo(String.class);
    ReplicatingInputFormat<Tuple1<String>, FileInputSplit> rif =
        new ReplicatingInputFormat<Tuple1<String>, FileInputSplit>(new TupleCsvInputFormat<Tuple1<String>>(new Path("/some/path"), typeInfo));

    DataSet<Tuple1<String>> source1 = env.createInput(rif, new TupleTypeInfo<Tuple1<String>>(BasicTypeInfo.STRING_TYPE_INFO));
    DataSet<Tuple1<String>> source2 = env.readCsvFile("/some/otherpath").types(String.class);

    DataSink<Tuple2<Tuple1<String>, Tuple1<String>>> out = source1
        .rebalance()
        .join(source2).where("*").equalTo("*")
        .writeAsText("/some/newpath");

    Plan plan = env.createProgramPlan();

    // submit the plan to the compiler
    OptimizedPlan oPlan = compileNoStats(plan);
}
Example 6
Source File: FlinkUtil.java From kylin with Apache License 2.0 | 6 votes |
public static DataSet parseInputPath(String inputPath, FileSystem fs, ExecutionEnvironment env,
        Class keyClass, Class valueClass) throws IOException {
    List<String> inputFolders = Lists.newArrayList();
    Path inputHDFSPath = new Path(inputPath);
    FileStatus[] fileStatuses = fs.listStatus(inputHDFSPath);
    boolean hasDir = false;
    for (FileStatus stat : fileStatuses) {
        if (stat.isDirectory() && !stat.getPath().getName().startsWith("_")) {
            hasDir = true;
            inputFolders.add(stat.getPath().toString());
        }
    }

    if (!hasDir) {
        return env.createInput(HadoopInputs.readSequenceFile(keyClass, valueClass, inputHDFSPath.toString()));
    }

    Job job = Job.getInstance();
    FileInputFormat.setInputPaths(job, StringUtil.join(inputFolders, ","));
    return env.createInput(HadoopInputs.createHadoopInput(new SequenceFileInputFormat(), keyClass, valueClass, job));
}
Example 7
Source File: ReplicatingDataSourceTest.java From flink with Apache License 2.0 | 6 votes |
/**
 * Tests compiler fail for join program with replicated data source and changing parallelism.
 */
@Test(expected = CompilerException.class)
public void checkJoinWithReplicatedSourceInputChangingparallelism() {
    ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment();
    env.setParallelism(DEFAULT_PARALLELISM);

    TupleTypeInfo<Tuple1<String>> typeInfo = TupleTypeInfo.getBasicTupleTypeInfo(String.class);
    ReplicatingInputFormat<Tuple1<String>, FileInputSplit> rif =
        new ReplicatingInputFormat<Tuple1<String>, FileInputSplit>(new TupleCsvInputFormat<Tuple1<String>>(new Path("/some/path"), typeInfo));

    DataSet<Tuple1<String>> source1 = env.createInput(rif, new TupleTypeInfo<Tuple1<String>>(BasicTypeInfo.STRING_TYPE_INFO));
    DataSet<Tuple1<String>> source2 = env.readCsvFile("/some/otherpath").types(String.class);

    DataSink<Tuple2<Tuple1<String>, Tuple1<String>>> out = source1
        .join(source2).where("*").equalTo("*").setParallelism(DEFAULT_PARALLELISM + 2)
        .writeAsText("/some/newpath");

    Plan plan = env.createProgramPlan();

    // submit the plan to the compiler
    OptimizedPlan oPlan = compileNoStats(plan);
}
Example 8
Source File: ReplicatingDataSourceTest.java From flink with Apache License 2.0 | 5 votes |
/**
 * Tests join program with replicated data source behind flatMap.
 */
@Test
public void checkJoinWithReplicatedSourceInputBehindFlatMap() {
    ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment();
    env.setParallelism(DEFAULT_PARALLELISM);

    TupleTypeInfo<Tuple1<String>> typeInfo = TupleTypeInfo.getBasicTupleTypeInfo(String.class);
    ReplicatingInputFormat<Tuple1<String>, FileInputSplit> rif =
        new ReplicatingInputFormat<Tuple1<String>, FileInputSplit>(new TupleCsvInputFormat<Tuple1<String>>(new Path("/some/path"), typeInfo));

    DataSet<Tuple1<String>> source1 = env.createInput(rif, new TupleTypeInfo<Tuple1<String>>(BasicTypeInfo.STRING_TYPE_INFO));
    DataSet<Tuple1<String>> source2 = env.readCsvFile("/some/otherpath").types(String.class);

    DataSink<Tuple2<Tuple1<String>, Tuple1<String>>> out = source1
        .flatMap(new IdFlatMap())
        .join(source2).where("*").equalTo("*")
        .writeAsText("/some/newpath");

    Plan plan = env.createProgramPlan();

    // submit the plan to the compiler
    OptimizedPlan oPlan = compileNoStats(plan);

    // check the optimized Plan
    // when join should have forward strategy on both sides
    SinkPlanNode sinkNode = oPlan.getDataSinks().iterator().next();
    DualInputPlanNode joinNode = (DualInputPlanNode) sinkNode.getPredecessor();

    ShipStrategyType joinIn1 = joinNode.getInput1().getShipStrategy();
    ShipStrategyType joinIn2 = joinNode.getInput2().getShipStrategy();

    Assert.assertEquals("Invalid ship strategy for an operator.", ShipStrategyType.FORWARD, joinIn1);
    Assert.assertEquals("Invalid ship strategy for an operator.", ShipStrategyType.FORWARD, joinIn2);
}
Example 9
Source File: JoinCancelingITCase.java From flink with Apache License 2.0 | 5 votes |
private void executeTask(
        JoinFunction<Tuple2<Integer, Integer>, Tuple2<Integer, Integer>, Tuple2<Integer, Integer>> joiner,
        boolean slow, int parallelism) throws Exception {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSet<Tuple2<Integer, Integer>> input1 = env.createInput(new InfiniteIntegerTupleInputFormat(slow));
    DataSet<Tuple2<Integer, Integer>> input2 = env.createInput(new InfiniteIntegerTupleInputFormat(slow));

    input1.join(input2, JoinOperatorBase.JoinHint.REPARTITION_SORT_MERGE)
        .where(0)
        .equalTo(0)
        .with(joiner)
        .output(new DiscardingOutputFormat<Tuple2<Integer, Integer>>());

    env.setParallelism(parallelism);
    runAndCancelJob(env.createProgramPlan(), 5 * 1000, 10 * 1000);
}
Example 10
Source File: ReplicatingDataSourceTest.java From Flink-CEPplus with Apache License 2.0 | 5 votes |
/**
 * Tests join program with replicated data source behind map.
 */
@Test
public void checkJoinWithReplicatedSourceInputBehindMap() {
    ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment();
    env.setParallelism(DEFAULT_PARALLELISM);

    TupleTypeInfo<Tuple1<String>> typeInfo = TupleTypeInfo.getBasicTupleTypeInfo(String.class);
    ReplicatingInputFormat<Tuple1<String>, FileInputSplit> rif =
        new ReplicatingInputFormat<Tuple1<String>, FileInputSplit>(new TupleCsvInputFormat<Tuple1<String>>(new Path("/some/path"), typeInfo));

    DataSet<Tuple1<String>> source1 = env.createInput(rif, new TupleTypeInfo<Tuple1<String>>(BasicTypeInfo.STRING_TYPE_INFO));
    DataSet<Tuple1<String>> source2 = env.readCsvFile("/some/otherpath").types(String.class);

    DataSink<Tuple2<Tuple1<String>, Tuple1<String>>> out = source1
        .map(new IdMap())
        .join(source2).where("*").equalTo("*")
        .writeAsText("/some/newpath");

    Plan plan = env.createProgramPlan();

    // submit the plan to the compiler
    OptimizedPlan oPlan = compileNoStats(plan);

    // check the optimized Plan
    // when join should have forward strategy on both sides
    SinkPlanNode sinkNode = oPlan.getDataSinks().iterator().next();
    DualInputPlanNode joinNode = (DualInputPlanNode) sinkNode.getPredecessor();

    ShipStrategyType joinIn1 = joinNode.getInput1().getShipStrategy();
    ShipStrategyType joinIn2 = joinNode.getInput2().getShipStrategy();

    Assert.assertEquals("Invalid ship strategy for an operator.", ShipStrategyType.FORWARD, joinIn1);
    Assert.assertEquals("Invalid ship strategy for an operator.", ShipStrategyType.FORWARD, joinIn2);
}
Example 11
Source File: ReplicatingDataSourceTest.java From Flink-CEPplus with Apache License 2.0 | 5 votes |
/**
 * Tests cross program with replicated data source.
 */
@Test
public void checkCrossWithReplicatedSourceInput() {
    ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment();
    env.setParallelism(DEFAULT_PARALLELISM);

    TupleTypeInfo<Tuple1<String>> typeInfo = TupleTypeInfo.getBasicTupleTypeInfo(String.class);
    ReplicatingInputFormat<Tuple1<String>, FileInputSplit> rif =
        new ReplicatingInputFormat<Tuple1<String>, FileInputSplit>(new TupleCsvInputFormat<Tuple1<String>>(new Path("/some/path"), typeInfo));

    DataSet<Tuple1<String>> source1 = env.createInput(rif, new TupleTypeInfo<Tuple1<String>>(BasicTypeInfo.STRING_TYPE_INFO));
    DataSet<Tuple1<String>> source2 = env.readCsvFile("/some/otherpath").types(String.class);

    DataSink<Tuple2<Tuple1<String>, Tuple1<String>>> out = source1
        .cross(source2)
        .writeAsText("/some/newpath");

    Plan plan = env.createProgramPlan();

    // submit the plan to the compiler
    OptimizedPlan oPlan = compileNoStats(plan);

    // check the optimized Plan
    // when cross should have forward strategy on both sides
    SinkPlanNode sinkNode = oPlan.getDataSinks().iterator().next();
    DualInputPlanNode crossNode = (DualInputPlanNode) sinkNode.getPredecessor();

    ShipStrategyType crossIn1 = crossNode.getInput1().getShipStrategy();
    ShipStrategyType crossIn2 = crossNode.getInput2().getShipStrategy();

    Assert.assertEquals("Invalid ship strategy for an operator.", ShipStrategyType.FORWARD, crossIn1);
    Assert.assertEquals("Invalid ship strategy for an operator.", ShipStrategyType.FORWARD, crossIn2);
}
Example 12
Source File: ReplicatingDataSourceTest.java From flink with Apache License 2.0 | 5 votes |
/**
 * Tests join program with replicated data source behind map partition.
 */
@Test
public void checkJoinWithReplicatedSourceInputBehindMapPartition() {
    ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment();
    env.setParallelism(DEFAULT_PARALLELISM);

    TupleTypeInfo<Tuple1<String>> typeInfo = TupleTypeInfo.getBasicTupleTypeInfo(String.class);
    ReplicatingInputFormat<Tuple1<String>, FileInputSplit> rif =
        new ReplicatingInputFormat<Tuple1<String>, FileInputSplit>(new TupleCsvInputFormat<Tuple1<String>>(new Path("/some/path"), typeInfo));

    DataSet<Tuple1<String>> source1 = env.createInput(rif, new TupleTypeInfo<Tuple1<String>>(BasicTypeInfo.STRING_TYPE_INFO));
    DataSet<Tuple1<String>> source2 = env.readCsvFile("/some/otherpath").types(String.class);

    DataSink<Tuple2<Tuple1<String>, Tuple1<String>>> out = source1
        .mapPartition(new IdPMap())
        .join(source2).where("*").equalTo("*")
        .writeAsText("/some/newpath");

    Plan plan = env.createProgramPlan();

    // submit the plan to the compiler
    OptimizedPlan oPlan = compileNoStats(plan);

    // check the optimized Plan
    // when join should have forward strategy on both sides
    SinkPlanNode sinkNode = oPlan.getDataSinks().iterator().next();
    DualInputPlanNode joinNode = (DualInputPlanNode) sinkNode.getPredecessor();

    ShipStrategyType joinIn1 = joinNode.getInput1().getShipStrategy();
    ShipStrategyType joinIn2 = joinNode.getInput2().getShipStrategy();

    Assert.assertEquals("Invalid ship strategy for an operator.", ShipStrategyType.FORWARD, joinIn1);
    Assert.assertEquals("Invalid ship strategy for an operator.", ShipStrategyType.FORWARD, joinIn2);
}
Example 13
Source File: BatchPojoExample.java From flink-learning with Apache License 2.0 | 5 votes |
public static void main(String[] args) throws Exception {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(1);

    List<CustomCassandraAnnotatedPojo> customCassandraAnnotatedPojos = IntStream.range(0, 20)
        .mapToObj(x -> new CustomCassandraAnnotatedPojo(UUID.randomUUID().toString(), x, 0))
        .collect(Collectors.toList());

    DataSet<CustomCassandraAnnotatedPojo> dataSet = env.fromCollection(customCassandraAnnotatedPojos);

    ClusterBuilder clusterBuilder = new ClusterBuilder() {
        private static final long serialVersionUID = -1754532803757154795L;

        @Override
        protected Cluster buildCluster(Cluster.Builder builder) {
            return builder.addContactPoints("127.0.0.1").build();
        }
    };

    dataSet.output(new CassandraPojoOutputFormat<>(clusterBuilder, CustomCassandraAnnotatedPojo.class,
        () -> new Mapper.Option[]{Mapper.Option.saveNullFields(true)}));

    env.execute("zhisheng");

    /*
     * This is for the purpose of showing an example of creating a DataSet using CassandraPojoInputFormat.
     */
    DataSet<CustomCassandraAnnotatedPojo> inputDS = env
        .createInput(new CassandraPojoInputFormat<>(
            SELECT_QUERY,
            clusterBuilder,
            CustomCassandraAnnotatedPojo.class,
            () -> new Mapper.Option[]{Mapper.Option.consistencyLevel(ConsistencyLevel.ANY)}
        ));

    inputDS.print();
}
Example 14
Source File: AvroTypeExtractionTest.java From flink with Apache License 2.0 | 4 votes |
private void testField(final String fieldName) throws Exception {
    before();

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    Path in = new Path(inFile.getAbsoluteFile().toURI());

    AvroInputFormat<User> users = new AvroInputFormat<>(in, User.class);
    DataSet<User> usersDS = env.createInput(users);

    DataSet<Object> res = usersDS
        .groupBy(fieldName)
        .reduceGroup((GroupReduceFunction<User, Object>) (values, out) -> {
            for (User u : values) {
                out.collect(u.get(fieldName));
            }
        })
        .returns(Object.class);

    res.writeAsText(resultPath);
    env.execute("Simple Avro read job");

    // test if automatic registration of the Types worked
    ExecutionConfig ec = env.getConfig();
    Assert.assertTrue(ec.getRegisteredKryoTypes().contains(Fixed16.class));

    switch (fieldName) {
        case "name":
            expected = "Alyssa\nCharlie";
            break;
        case "type_enum":
            expected = "GREEN\nRED\n";
            break;
        case "type_double_test":
            expected = "123.45\n1.337\n";
            break;
        default:
            Assert.fail("Unknown field");
            break;
    }

    after();
}
Example 15
Source File: FlinkPravegaInputFormatITCase.java From flink-connectors with Apache License 2.0 | 4 votes |
/**
 * Verifies that the input format:
 *  - correctly reads all records in a given set of multiple Pravega streams
 *  - allows multiple executions
 */
@Test
public void testBatchInput() throws Exception {
    final int numElements1 = 100;
    final int numElements2 = 300;

    // set up the stream
    final String streamName1 = RandomStringUtils.randomAlphabetic(20);
    final String streamName2 = RandomStringUtils.randomAlphabetic(20);

    final Set<String> streams = new HashSet<>();
    streams.add(streamName1);
    streams.add(streamName2);

    SETUP_UTILS.createTestStream(streamName1, 3);
    SETUP_UTILS.createTestStream(streamName2, 5);

    try (
        final EventStreamWriter<Integer> eventWriter1 = SETUP_UTILS.getIntegerWriter(streamName1);
        final EventStreamWriter<Integer> eventWriter2 = SETUP_UTILS.getIntegerWriter(streamName2);

        // create the producer that writes to the stream
        final ThrottledIntegerWriter producer1 = new ThrottledIntegerWriter(
            eventWriter1,
            numElements1,
            numElements1 + 1,  // no need to block writer for a batch test
            0,
            false
        );

        final ThrottledIntegerWriter producer2 = new ThrottledIntegerWriter(
            eventWriter2,
            numElements2,
            numElements2 + 1,  // no need to block writer for a batch test
            0,
            false
        )
    ) {
        // write batch input
        producer1.start();
        producer2.start();
        producer1.sync();
        producer2.sync();

        final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(3);

        // simple pipeline that reads from Pravega and collects the events
        DataSet<Integer> integers = env.createInput(
            FlinkPravegaInputFormat.<Integer>builder()
                .forStream(streamName1)
                .forStream(streamName2)
                .withPravegaConfig(SETUP_UTILS.getPravegaConfig())
                .withDeserializationSchema(new IntegerDeserializationSchema())
                .build(),
            BasicTypeInfo.INT_TYPE_INFO
        );

        // verify that all events were read
        Assert.assertEquals(numElements1 + numElements2, integers.collect().size());

        // this verifies that the input format allows multiple passes
        Assert.assertEquals(numElements1 + numElements2, integers.collect().size());
    }
}
Example 16
Source File: TPCHQuery3Parquet.java From parquet-flinktacular with Apache License 2.0 | 4 votes |
private static DataSet<Tuple2<Void, OrderTable>> getOrdersDataSet(ExecutionEnvironment env) throws IOException {
    Job job = Job.getInstance();
    ParquetInputFormat.setReadSupportClass(job, ThriftReadSupport.class);
    job.getConfiguration().set("parquet.thrift.column.filter", "ID;CUSTKEY;ORDERDATE;SHIP_PRIORITY");

    HadoopInputFormat hadoopInputFormat =
        new HadoopInputFormat(new ParquetThriftInputFormat(), Void.class, OrderTable.class, job);

    ParquetThriftInputFormat.addInputPath(job, new Path(ordersPath));

    // Filter all Orders with o_orderdate < 12.03.1995
    ParquetThriftInputFormat.setUnboundRecordFilter(job, OrderFilter.class);

    DataSet<Tuple2<Void, OrderTable>> data = env.createInput(hadoopInputFormat);

    return data;
}
Example 17
Source File: TPCHQuery3Parquet.java From parquet-flinktacular with Apache License 2.0 | 4 votes |
private static DataSet<Tuple2<Void, LineitemTable>> getLineitemDataSet(ExecutionEnvironment env) throws IOException {
    Job job = Job.getInstance();
    ParquetInputFormat.setReadSupportClass(job, ThriftReadSupport.class);
    job.getConfiguration().set("parquet.thrift.column.filter", "ORDERKEY;EXTENDEDPRICE;DISCOUNT;SHIPDATE");

    HadoopInputFormat hadoopInputFormat =
        new HadoopInputFormat(new ParquetThriftInputFormat(), Void.class, LineitemTable.class, job);

    // Filter all Lineitems with l_shipdate > 12.03.1995
    ParquetThriftInputFormat.setUnboundRecordFilter(job, LineitemFilter.class);

    ParquetThriftInputFormat.addInputPath(job, new Path(lineitemPath));

    DataSet<Tuple2<Void, LineitemTable>> data = env.createInput(hadoopInputFormat);

    return data;
}
Example 18
Source File: AvroExternalJarProgram.java From Flink-CEPplus with Apache License 2.0 | 3 votes |
public static void main(String[] args) throws Exception {
    String inputPath = args[0];

    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<MyUser> input = env.createInput(new AvroInputFormat<MyUser>(new Path(inputPath), MyUser.class));

    DataSet<Tuple2<String, MyUser>> result = input.map(new NameExtractor()).groupBy(0).reduce(new NameGrouper());

    result.output(new DiscardingOutputFormat<Tuple2<String, MyUser>>());
    env.execute();
}
Example 19
Source File: AvroExternalJarProgram.java From flink with Apache License 2.0 | 3 votes |
public static void main(String[] args) throws Exception {
    String inputPath = args[0];

    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<MyUser> input = env.createInput(new AvroInputFormat<MyUser>(new Path(inputPath), MyUser.class));

    DataSet<Tuple2<String, MyUser>> result = input.map(new NameExtractor()).groupBy(0).reduce(new NameGrouper());

    result.output(new DiscardingOutputFormat<Tuple2<String, MyUser>>());
    env.execute();
}