Java Code Examples for org.apache.hadoop.filecache.DistributedCache#createSymlink()
The following examples show how to use org.apache.hadoop.filecache.DistributedCache#createSymlink(). Each example is taken from an open source project; the source file, project, and license are noted above it.
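All of the examples below follow the same basic pattern: add a file to the distributed cache, optionally with a "#name" URI fragment that chooses the symlink name, then call createSymlink() so tasks can open the file directly from their working directory. As a quick orientation, here is a minimal sketch of that pattern on the old mapred API; the HDFS path, symlink name, and class name are hypothetical, not taken from any project below.

import java.net.URI;

import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.mapred.JobConf;

public class CacheSymlinkSketch {
  public static void main(String[] args) throws Exception {
    JobConf conf = new JobConf();
    // Hypothetical HDFS file; the "#lookup" fragment names the symlink
    // that each task will see in its working directory.
    URI cacheUri = new URI("hdfs://namenode:8020/data/lookup.txt#lookup");
    DistributedCache.addCacheFile(cacheUri, conf);
    // Without this call no symlink is created, and tasks must find the
    // localized copy via DistributedCache.getLocalCacheFiles() instead.
    DistributedCache.createSymlink(conf);
    // A task can now open the file simply as new FileReader("lookup").
  }
}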
Example 1
Source File: TeraSort.java From hadoop-book with Apache License 2.0
public int run(String[] args) throws Exception {
  LOG.info("starting");
  JobConf job = (JobConf) getConf();
  Path inputDir = new Path(args[0]);
  inputDir = inputDir.makeQualified(inputDir.getFileSystem(job));
  Path partitionFile = new Path(inputDir, TeraInputFormat.PARTITION_FILENAME);
  URI partitionUri = new URI(partitionFile.toString() + "#" +
                             TeraInputFormat.PARTITION_FILENAME);
  TeraInputFormat.setInputPaths(job, new Path(args[0]));
  FileOutputFormat.setOutputPath(job, new Path(args[1]));
  job.setJobName("TeraSort");
  job.setJarByClass(TeraSort.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setInputFormat(TeraInputFormat.class);
  job.setOutputFormat(TeraOutputFormat.class);
  job.setPartitionerClass(TotalOrderPartitioner.class);
  TeraInputFormat.writePartitionFile(job, partitionFile);
  DistributedCache.addCacheFile(partitionUri, job);
  DistributedCache.createSymlink(job);
  job.setInt("dfs.replication", 1);
  TeraOutputFormat.setFinalSync(job, true);
  JobClient.runJob(job);
  LOG.info("done");
  return 0;
}
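The URI fragment ("#" + PARTITION_FILENAME) names the symlink, and createSymlink() asks the framework to create it in each task's working directory, so the TotalOrderPartitioner can open the partition file by its plain file name at task startup.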
Example 2
Source File: TeraSort.java From hadoop-gpu with Apache License 2.0
public int run(String[] args) throws Exception {
  LOG.info("starting");
  JobConf job = (JobConf) getConf();
  Path inputDir = new Path(args[0]);
  inputDir = inputDir.makeQualified(inputDir.getFileSystem(job));
  Path partitionFile = new Path(inputDir, TeraInputFormat.PARTITION_FILENAME);
  URI partitionUri = new URI(partitionFile.toString() + "#" +
                             TeraInputFormat.PARTITION_FILENAME);
  TeraInputFormat.setInputPaths(job, new Path(args[0]));
  FileOutputFormat.setOutputPath(job, new Path(args[1]));
  job.setJobName("TeraSort");
  job.setJarByClass(TeraSort.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setInputFormat(TeraInputFormat.class);
  job.setOutputFormat(TeraOutputFormat.class);
  job.setPartitionerClass(TotalOrderPartitioner.class);
  TeraInputFormat.writePartitionFile(job, partitionFile);
  DistributedCache.addCacheFile(partitionUri, job);
  DistributedCache.createSymlink(job);
  job.setInt("dfs.replication", 1);
  TeraOutputFormat.setFinalSync(job, true);
  JobClient.runJob(job);
  LOG.info("done");
  return 0;
}
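The hadoop-gpu fork carries the same TeraSort driver as Example 1, unchanged.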
Example 3
Source File: JobControlCompiler.java From spork with Apache License 2.0
/**
 * If the url is not in HDFS, copies the path to HDFS from the local
 * file system before adding it to the distributed cache.
 * @param pigContext the pigContext
 * @param conf the job conf
 * @param url the url to be added to the distributed cache
 * @throws IOException
 */
@SuppressWarnings("deprecation")
private static void putJarOnClassPathThroughDistributedCache(
        PigContext pigContext, Configuration conf, URL url)
        throws IOException {
    // Turn on the symlink feature
    DistributedCache.createSymlink(conf);

    Path distCachePath = getExistingDistCacheFilePath(conf, url);
    if (distCachePath != null) {
        log.info("Jar file " + url + " already in DistributedCache as "
                + distCachePath + ". Not copying to hdfs and adding again");
        // Path already in dist cache
        if (!HadoopShims.isHadoopYARN()) {
            // MapReduce on YARN includes $PWD/* on the classpath, which picks
            // up all *.jar files, so the jar does not have to be separately
            // added to mapreduce.job.classpath.files. But on Hadoop 1.x the
            // path may only be in 'mapred.cache.files' and not in
            // 'mapreduce.job.classpath.files', so add it there.
            DistributedCache.addFileToClassPath(distCachePath, conf,
                    distCachePath.getFileSystem(conf));
        }
    } else {
        // REGISTER always copies the jar file locally. See PigServer.registerJar()
        Path pathInHDFS = shipToHDFS(pigContext, conf, url);
        DistributedCache.addFileToClassPath(pathInHDFS, conf, FileSystem.get(conf));
        log.info("Added jar " + url + " to DistributedCache through " + pathInHDFS);
    }
}
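Pig turns the symlink feature on once up front, then takes one of two paths: if the jar is already in the distributed cache it is only added to the job classpath (and only on Hadoop 1.x, where YARN's automatic $PWD/* classpath entry is not available); otherwise the jar is first shipped to HDFS, and the new path is both cached and put on the classpath.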
Example 4
Source File: TeraSort.java From RDFS with Apache License 2.0
public int run(String[] args) throws Exception {
  LOG.info("starting");
  JobConf job = (JobConf) getConf();
  Path inputDir = new Path(args[0]);
  inputDir = inputDir.makeQualified(inputDir.getFileSystem(job));
  Path partitionFile = new Path(inputDir, TeraInputFormat.PARTITION_FILENAME);
  URI partitionUri = new URI(partitionFile.toString() + "#" +
                             TeraInputFormat.PARTITION_FILENAME);
  TeraInputFormat.setInputPaths(job, new Path(args[0]));
  FileOutputFormat.setOutputPath(job, new Path(args[1]));
  job.setJobName("TeraSort");
  job.setJarByClass(TeraSort.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setInputFormat(TeraInputFormat.class);
  job.setOutputFormat(TeraOutputFormat.class);
  job.setPartitionerClass(TotalOrderPartitioner.class);
  TeraInputFormat.writePartitionFile(job, partitionFile);
  DistributedCache.addCacheFile(partitionUri, job);
  DistributedCache.createSymlink(job);
  job.setInt("dfs.replication", 1);
  TeraOutputFormat.setFinalSync(job, true);
  long startTime = System.currentTimeMillis();
  JobClient.runJob(job);
  long endTime = System.currentTimeMillis();
  System.out.println((float) (endTime - startTime) / 1000);
  LOG.info("done");
  return 0;
}
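This RDFS variant matches Example 1 except that it wraps runJob() in System.currentTimeMillis() calls and prints the elapsed time in seconds.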
Example 5
Source File: Submitter.java From RDFS with Apache License 2.0
private static void setupPipesJob(JobConf conf) throws IOException {
  // default map output types to Text
  if (!getIsJavaMapper(conf)) {
    conf.setMapRunnerClass(PipesMapRunner.class);
    // Save the user's partitioner and hook in ours.
    setJavaPartitioner(conf, conf.getPartitionerClass());
    conf.setPartitionerClass(PipesPartitioner.class);
  }
  if (!getIsJavaReducer(conf)) {
    conf.setReducerClass(PipesReducer.class);
    if (!getIsJavaRecordWriter(conf)) {
      conf.setOutputFormat(NullOutputFormat.class);
    }
  }
  String textClassname = Text.class.getName();
  setIfUnset(conf, "mapred.mapoutput.key.class", textClassname);
  setIfUnset(conf, "mapred.mapoutput.value.class", textClassname);
  setIfUnset(conf, "mapred.output.key.class", textClassname);
  setIfUnset(conf, "mapred.output.value.class", textClassname);

  // Use PipesNonJavaInputFormat if necessary to handle progress reporting
  // from C++ RecordReaders ...
  if (!getIsJavaRecordReader(conf) && !getIsJavaMapper(conf)) {
    conf.setClass("mapred.pipes.user.inputformat",
                  conf.getInputFormat().getClass(), InputFormat.class);
    conf.setInputFormat(PipesNonJavaInputFormat.class);
  }

  String exec = getExecutable(conf);
  if (exec == null) {
    throw new IllegalArgumentException("No application program defined.");
  }
  // add the default debug script only when the executable is expressed as
  // <path>#<executable>
  if (exec.contains("#")) {
    DistributedCache.createSymlink(conf);
    // set default gdb commands for map and reduce tasks
    String defScript = "$HADOOP_HOME/src/c++/pipes/debug/pipes-default-script";
    setIfUnset(conf, "mapred.map.task.debug.script", defScript);
    setIfUnset(conf, "mapred.reduce.task.debug.script", defScript);
  }
  URI[] fileCache = DistributedCache.getCacheFiles(conf);
  if (fileCache == null) {
    fileCache = new URI[1];
  } else {
    URI[] tmp = new URI[fileCache.length + 1];
    System.arraycopy(fileCache, 0, tmp, 1, fileCache.length);
    fileCache = tmp;
  }
  try {
    fileCache[0] = new URI(exec);
  } catch (URISyntaxException e) {
    IOException ie = new IOException("Problem parsing executable URI " + exec);
    ie.initCause(e);
    throw ie;
  }
  DistributedCache.setCacheFiles(fileCache, conf);
}
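Two details are worth noting here: createSymlink() is only enabled when the executable is given in <path>#<name> form, since the fragment after '#' is what names the symlink in the task's working directory, and the executable's URI is prepended at slot 0 of the job's existing cache file list rather than appended, so it is localized alongside any files the user has already cached.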
Example 6
Source File: TestMiniMRMapRedDebugScript.java From RDFS with Apache License 2.0
/**
 * Launches a failing map task and debugs the failed task.
 * @param conf configuration for the mapred job
 * @param inDir input path
 * @param outDir output path
 * @param debugDir debug directory where the script is present
 * @param debugScript the command that executes the script
 * @param input input text
 * @return the output of the debug script
 * @throws IOException
 */
public String launchFailMapAndDebug(JobConf conf, Path inDir, Path outDir,
                                    Path debugDir, String debugScript,
                                    String input) throws IOException {
  // set up the input file system and write the input text.
  FileSystem inFs = inDir.getFileSystem(conf);
  FileSystem outFs = outDir.getFileSystem(conf);
  outFs.delete(outDir, true);
  if (!inFs.mkdirs(inDir)) {
    throw new IOException("Mkdirs failed to create " + inDir.toString());
  }
  { // write input into input file
    DataOutputStream file = inFs.create(new Path(inDir, "part-0"));
    file.writeBytes(input);
    file.close();
  }

  // configure the mapred job for a failing map task.
  conf.setJobName("failmap");
  conf.setMapperClass(MapClass.class);
  conf.setReducerClass(IdentityReducer.class);
  conf.setNumMapTasks(1);
  conf.setNumReduceTasks(0);
  conf.setMapDebugScript(debugScript);
  FileInputFormat.setInputPaths(conf, inDir);
  FileOutputFormat.setOutputPath(conf, outDir);
  String TEST_ROOT_DIR = new Path(System.getProperty("test.build.data",
      "/tmp")).toString().replace(' ', '+');
  conf.set("test.build.data", TEST_ROOT_DIR);

  // copy the debug script to the cache from the local file system.
  FileSystem debugFs = debugDir.getFileSystem(conf);
  Path scriptPath = new Path(debugDir, "testscript.txt");
  Path cachePath = new Path("/cacheDir");
  if (!debugFs.mkdirs(cachePath)) {
    throw new IOException("Mkdirs failed to create " + cachePath.toString());
  }
  debugFs.copyFromLocalFile(scriptPath, cachePath);

  URI uri = debugFs.getUri().resolve(cachePath + "/testscript.txt#testscript");
  DistributedCache.createSymlink(conf);
  DistributedCache.addCacheFile(uri, conf);

  RunningJob job = null;
  // run the job; it will fail with an IOException.
  try {
    job = new JobClient(conf).submitJob(conf);
  } catch (IOException e) {
    LOG.info("Running Job failed", e);
  }

  JobID jobId = job.getID();
  // construct the task id of the first map task of failmap
  TaskAttemptID taskId = new TaskAttemptID(new TaskID(jobId, true, 0), 0);
  // wait for the job to finish.
  while (!job.isComplete()) ;
  // return the contents of the debugout log.
  return readTaskLog(TaskLog.LogName.DEBUGOUT, taskId, false);
}
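The URI built by resolve() ends in testscript.txt#testscript, so with createSymlink() enabled the localized script is visible to the failing task under the name testscript. After the job fails, readTaskLog() returns the DEBUGOUT stream that the debug script wrote for the first map attempt.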
Example 7
Source File: TestTaskLogsMonitor.java From RDFS with Apache License 2.0
/**
 * Tests the truncation of the DEBUGOUT file by {@link TaskLogsMonitor}.
 * @throws IOException
 */
@Test
public void testDebugLogsTruncationWithMiniMR() throws IOException {
  MiniMRCluster mr = null;
  try {
    JobConf clusterConf = new JobConf();
    clusterConf.setLong(TaskTracker.MAP_USERLOG_RETAIN_SIZE, 10000L);
    clusterConf.setLong(TaskTracker.REDUCE_USERLOG_RETAIN_SIZE, 10000L);
    mr = new MiniMRCluster(1, "file:///", 3, null, null, clusterConf);

    JobConf conf = mr.createJobConf();

    Path inDir = new Path(TEST_ROOT_DIR + "/input");
    Path outDir = new Path(TEST_ROOT_DIR + "/output");
    FileSystem fs = FileSystem.get(conf);
    if (fs.exists(outDir)) {
      fs.delete(outDir, true);
    }
    if (!fs.exists(inDir)) {
      fs.mkdirs(inDir);
    }
    String input = "The quick brown fox jumped over the lazy dog";
    DataOutputStream file = fs.create(new Path(inDir, "part-0"));
    file.writeBytes(input);
    file.close();

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(Text.class);

    FileInputFormat.setInputPaths(conf, inDir);
    FileOutputFormat.setOutputPath(conf, outDir);
    conf.setNumMapTasks(1);
    conf.setMaxMapAttempts(1);
    conf.setNumReduceTasks(0);
    conf.setMapperClass(TestMiniMRMapRedDebugScript.MapClass.class);

    // copy the debug script to the cache from the local file system.
    Path scriptPath = new Path(TEST_ROOT_DIR, "debug-script.txt");
    String debugScriptContent =
        "for ((i=0;i<1000;i++)); " +
        "do " +
        "echo \"Lots of logs! Lots of logs! " +
        "Waiting to be truncated! Lots of logs!\";" +
        "done";
    DataOutputStream scriptFile = fs.create(scriptPath);
    scriptFile.writeBytes(debugScriptContent);
    scriptFile.close();
    new File(scriptPath.toUri().getPath()).setExecutable(true);

    URI uri = scriptPath.toUri();
    DistributedCache.createSymlink(conf);
    DistributedCache.addCacheFile(uri, conf);
    conf.setMapDebugScript(scriptPath.toUri().getPath());

    RunningJob job = null;
    try {
      JobClient jc = new JobClient(conf);
      job = jc.submitJob(conf);
      try {
        jc.monitorAndPrintJob(conf, job);
      } catch (InterruptedException e) {
        // ignore interruptions while waiting for the job
      }
    } catch (IOException ioe) {
      // the job is expected to fail
    } finally {
      for (TaskCompletionEvent tce : job.getTaskCompletionEvents(0)) {
        File debugOutFile =
            TaskLog.getTaskLogFile(tce.getTaskAttemptId(),
                                   TaskLog.LogName.DEBUGOUT);
        if (debugOutFile.exists()) {
          long length = debugOutFile.length();
          assertTrue("DEBUGOUT log file length for " +
                     tce.getTaskAttemptId() + " is " + length +
                     " and not =10000", length == 10000);
        }
      }
    }
  } finally {
    if (mr != null) {
      mr.shutdown();
    }
  }
}
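Unlike Example 6, the script URI here has no '#' fragment: the test runs against a local file:/// file system and hands setMapDebugScript() the script's absolute path, while the addCacheFile() and createSymlink() calls mirror the setup a distributed run would need. The finally block then asserts that each existing DEBUGOUT file was truncated to exactly the 10000-byte retain size configured on the MiniMRCluster.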
Example 8
Source File: TestMiniMRMapRedDebugScript.java From hadoop-gpu with Apache License 2.0
/**
 * Launches a failing map task and debugs the failed task.
 * @param conf configuration for the mapred job
 * @param inDir input path
 * @param outDir output path
 * @param debugDir debug directory where the script is present
 * @param debugScript the command that executes the script
 * @param input input text
 * @return the output of the debug script
 * @throws IOException
 */
public String launchFailMapAndDebug(JobConf conf, Path inDir, Path outDir,
                                    Path debugDir, String debugScript,
                                    String input) throws IOException {
  // set up the input file system and write the input text.
  FileSystem inFs = inDir.getFileSystem(conf);
  FileSystem outFs = outDir.getFileSystem(conf);
  outFs.delete(outDir, true);
  if (!inFs.mkdirs(inDir)) {
    throw new IOException("Mkdirs failed to create " + inDir.toString());
  }
  { // write input into input file
    DataOutputStream file = inFs.create(new Path(inDir, "part-0"));
    file.writeBytes(input);
    file.close();
  }

  // configure the mapred job for a failing map task.
  conf.setJobName("failmap");
  conf.setMapperClass(MapClass.class);
  conf.setReducerClass(IdentityReducer.class);
  conf.setNumMapTasks(1);
  conf.setNumReduceTasks(0);
  conf.setMapDebugScript(debugScript);
  FileInputFormat.setInputPaths(conf, inDir);
  FileOutputFormat.setOutputPath(conf, outDir);
  String TEST_ROOT_DIR = new Path(System.getProperty("test.build.data",
      "/tmp")).toString().replace(' ', '+');
  conf.set("test.build.data", TEST_ROOT_DIR);

  // copy the debug script to the cache from the local file system.
  FileSystem debugFs = debugDir.getFileSystem(conf);
  Path scriptPath = new Path(debugDir, "testscript.txt");
  Path cachePath = new Path("/cacheDir");
  if (!debugFs.mkdirs(cachePath)) {
    throw new IOException("Mkdirs failed to create " + cachePath.toString());
  }
  debugFs.copyFromLocalFile(scriptPath, cachePath);

  URI uri = debugFs.getUri().resolve(cachePath + "/testscript.txt#testscript");
  DistributedCache.createSymlink(conf);
  DistributedCache.addCacheFile(uri, conf);

  RunningJob job = null;
  // run the job; it will fail with an IOException.
  try {
    job = new JobClient(conf).submitJob(conf);
  } catch (IOException e) {
    LOG.info("Running Job failed", e);
  }

  JobID jobId = job.getID();
  // construct the task id of the first map task of failmap
  TaskAttemptID taskId = new TaskAttemptID(new TaskID(jobId, true, 0), 0);
  // wait for the job to finish.
  while (!job.isComplete()) ;
  // return the contents of the debugout log.
  return readTaskLog(TaskLog.LogName.DEBUGOUT, taskId, false);
}
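This is the same launchFailMapAndDebug() method as in Example 6; the hadoop-gpu fork carries an identical copy of the test.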