Java Code Examples for org.apache.hadoop.mapreduce.lib.input.FileInputFormat#setInputPathFilter()
The following examples show how to use
org.apache.hadoop.mapreduce.lib.input.FileInputFormat#setInputPathFilter().
Example 1
Source: the mrgeo project, Apache License 2.0 | 5 votes
/**
 * Registers {@code inputWithZoom} as an input path of {@code job} and installs
 * {@code MapFileFilter} as the job's input path filter.
 *
 * @param job the MapReduce job whose input configuration is updated
 * @param inputWithZoom path string added to the job's inputs; presumably it already
 *        includes the zoom-level directory — TODO confirm against callers
 * @throws IOException if {@code FileInputFormat.addInputPath} fails
 */
public static void setInputInfo(Job job, String inputWithZoom) throws IOException
// Earlier variants of the input setup, left disabled:
// job.setInputFormatClass(HdfsMrsPyramidInputFormat.class);
//final String scannedInput = inputs.get(0);
//FileInputFormat.addInputPath(job, new Path(scannedInput));
FileInputFormat.addInputPath(job, new Path(inputWithZoom));
// The filter class is stored in the job configuration and applied when inputs are listed.
FileInputFormat.setInputPathFilter(job, MapFileFilter.class);
Example 2
Source: the kylin-on-parquet-v2 project, Apache License 2.0 | 4 votes
public int run(String[] args) throws Exception {
Options options = new Options();
try {
parseOptions(options, args);
job = Job.getInstance(getConf(), getOptionValue(OPTION_JOB_NAME));
String job_id = getOptionValue(OPTION_CUBING_JOB_ID);
String cubeName = getOptionValue(OPTION_CUBE_NAME);
Path output = new Path(getOptionValue(OPTION_OUTPUT_PATH));
Path input = new Path(getOptionValue(OPTION_INPUT_PATH));
//add metadata to distributed cache
CubeManager cubeMgr = CubeManager.getInstance(KylinConfig.getInstanceFromEnv());
CubeInstance cube = cubeMgr.getCube(cubeName);
attachCubeMetadata(cube, job.getConfiguration());
List<TblColRef> uhcColumns = cube.getDescriptor().getAllUHCColumns();
int reducerCount = uhcColumns.size();
//Note! handle uhc columns is null.
boolean hasUHCValue = false;
for (TblColRef tblColRef : uhcColumns) {
Path path = new Path(input.toString() + "/" + tblColRef.getIdentity());
if (HadoopUtil.getFileSystem(path).exists(path)) {
FileInputFormat.addInputPath(job, path);
FileInputFormat.setInputPathFilter(job, UHCDictPathFilter.class);
hasUHCValue = true;
if (!hasUHCValue) {
isSkipped = true;
return 0;
setJobClasspath(job, cube.getConfig());
setupReducer(output, reducerCount);
job.getConfiguration().set(BatchConstants.CFG_CUBE_NAME, cubeName);
job.getConfiguration().set(BatchConstants.ARG_CUBING_JOB_ID, job_id);
job.getConfiguration().set(BatchConstants.CFG_GLOBAL_DICT_BASE_DIR, KylinConfig.getInstanceFromEnv().getHdfsWorkingDirectory());
job.getConfiguration().set(BatchConstants.CFG_MAPRED_OUTPUT_COMPRESS, "false");
//8G memory is enough for all global dict, because the input is sequential and we handle global dict slice by slice
job.getConfiguration().set("mapreduce.reduce.memory.mb", "8500");
job.getConfiguration().set("", "-Xmx8g");
//Copying global dict to working dir in GlobalDictHDFSStore maybe elapsed a long time (Maybe we could improve it)
//Waiting the global dict lock maybe also take a long time.
//So we set 8 hours here
job.getConfiguration().set("mapreduce.task.timeout", "28800000");
//allow user specially set config for uhc step
for (Map.Entry<String, String> entry : cube.getConfig().getUHCMRConfigOverride().entrySet()) {
job.getConfiguration().set(entry.getKey(), entry.getValue());
return waitForCompletion(job);
} finally {
if (job != null)
Example 3 (no code is included for this example in this excerpt)
Example 4
Source: the spork project, Apache License 2.0 | 4 votes
/**
 * Delegates to the superclass to set the input location, then installs
 * {@code TestPathFilter} as the job's input path filter and sets {@code test} to
 * {@code true}.
 *
 * @param location input location forwarded unchanged to {@code super.setLocation}
 * @param job the job whose input configuration is updated
 * @throws IOException if the superclass call fails
 */
public void setLocation(String location, Job job) throws IOException {
super.setLocation(location, job);
FileInputFormat.setInputPathFilter(job, TestPathFilter.class);
// `test` is declared outside this excerpt; presumably a flag that records this method ran — TODO confirm.
test = true;
Example 5
Source: the spork project, Apache License 2.0 | 4 votes
/**
 * Sets {@code location} as the job's input path(s) and installs
 * {@code JobHistoryPathFilter} as the job's input path filter.
 *
 * @param location input path string passed to {@code FileInputFormat.setInputPaths};
 *        that overload accepts a comma-separated list of paths
 * @param job the job whose input configuration is updated
 * @throws IOException if {@code FileInputFormat.setInputPaths} fails
 */
public void setLocation(String location, Job job) throws IOException {
// setInputPaths replaces any previously configured inputs rather than appending.
FileInputFormat.setInputPaths(job, location);
FileInputFormat.setInputPathFilter(job, JobHistoryPathFilter.class);