com.amazonaws.services.elasticmapreduce.util.StepFactory Java Examples
The following examples show how to use
com.amazonaws.services.elasticmapreduce.util.StepFactory.
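StepFactory is a helper for building the HadoopJarStepConfig objects behind common EMR steps; each factory method returns a step configuration that can be wrapped in a StepConfig, as the examples below show. A minimal sketch (the S3 script location is hypothetical):

// Minimal sketch: wrap a predefined StepFactory step in a StepConfig.
// The S3 script location is a hypothetical placeholder.
StepFactory stepFactory = new StepFactory();
StepConfig step = new StepConfig()
    .withName("Run a script")
    .withActionOnFailure(ActionOnFailure.CONTINUE)
    .withHadoopJarStep(stepFactory.newScriptRunnerStep("s3://my-bucket/scripts/setup.sh"));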
Example #1
Source File: emr-add-steps.java From aws-doc-sdk-examples with Apache License 2.0
public static void main(String[] args) {
    AWSCredentials credentials_profile = null;
    try {
        credentials_profile = new ProfileCredentialsProvider("default").getCredentials();
    } catch (Exception e) {
        throw new AmazonClientException(
                "Cannot load credentials from .aws/credentials file. " +
                "Make sure that the credentials file exists and the profile name is specified within it.",
                e);
    }

    AmazonElasticMapReduce emr = AmazonElasticMapReduceClientBuilder.standard()
        .withCredentials(new AWSStaticCredentialsProvider(credentials_profile))
        .withRegion(Regions.US_WEST_1)
        .build();

    // Run a bash script using a predefined step in the StepFactory helper class
    StepFactory stepFactory = new StepFactory();
    StepConfig runBashScript = new StepConfig()
        .withName("Run a bash script")
        .withHadoopJarStep(stepFactory.newScriptRunnerStep("s3://jeffgoll/emr-scripts/create_users.sh"))
        .withActionOnFailure("CONTINUE");

    // Run a custom jar file as a step
    HadoopJarStepConfig hadoopConfig1 = new HadoopJarStepConfig()
        .withJar("s3://path/to/my/jarfolder") // replace with the location of the jar to run as a step
        .withMainClass("com.my.Main1")        // optional main class; can be omitted if the jar above has a manifest
        .withArgs("--verbose");               // optional list of arguments to pass to the jar
    StepConfig myCustomJarStep = new StepConfig("RunHadoopJar", hadoopConfig1);

    AddJobFlowStepsResult result = emr.addJobFlowSteps(new AddJobFlowStepsRequest()
        .withJobFlowId("j-xxxxxxxxxxxx") // replace with the cluster ID to run the steps on
        .withSteps(runBashScript, myCustomJarStep));

    System.out.println(result.getStepIds());
}
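After submitting steps you would typically poll for their status. A minimal sketch, assuming the emr client and result variables from the example above:

// Minimal sketch: check the state of the first submitted step.
// Assumes the emr client and result variables from the example above.
DescribeStepResult stepResult = emr.describeStep(new DescribeStepRequest()
    .withClusterId("j-xxxxxxxxxxxx") // same cluster ID used above
    .withStepId(result.getStepIds().get(0)));
System.out.println("Step state: " + stepResult.getStep().getStatus().getState());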
Example #2
Source File: EmrPigStepHelper.java From herd with Apache License 2.0
@Override
public StepConfig getEmrStepConfig(Object step) {
    EmrPigStep pigStep = (EmrPigStep) step;

    // Default ActionOnFailure is to cancel the execution and wait
    ActionOnFailure actionOnFailure = ActionOnFailure.CANCEL_AND_WAIT;
    if (pigStep.isContinueOnError() != null && pigStep.isContinueOnError()) {
        // Override based on user input
        actionOnFailure = ActionOnFailure.CONTINUE;
    }

    // If there are no arguments to the Pig script
    if (CollectionUtils.isEmpty(pigStep.getScriptArguments())) {
        // Just build the StepConfig object and return
        return new StepConfig().withName(pigStep.getStepName().trim()).withActionOnFailure(actionOnFailure)
            .withHadoopJarStep(new StepFactory().newRunPigScriptStep(pigStep.getScriptLocation().trim()));
    }
    // If there are arguments specified
    else {
        return new StepConfig().withName(pigStep.getStepName().trim()).withActionOnFailure(actionOnFailure)
            .withHadoopJarStep(new StepFactory().newRunPigScriptStep(pigStep.getScriptLocation().trim(),
                pigStep.getScriptArguments().toArray(new String[pigStep.getScriptArguments().size()])));
    }
}
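A hypothetical invocation of this helper; the setter names on herd's EmrPigStep are assumptions inferred from the getters used above:

// Hypothetical usage; the setter names are assumptions based on the getters above.
EmrPigStep pigStep = new EmrPigStep();
pigStep.setStepName("Nightly aggregation");                  // hypothetical step name
pigStep.setScriptLocation("s3://my-bucket/scripts/agg.pig"); // hypothetical script location
pigStep.setContinueOnError(Boolean.TRUE);                    // yields ActionOnFailure.CONTINUE
StepConfig stepConfig = new EmrPigStepHelper().getEmrStepConfig(pigStep);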
Example #3
Source File: EmrHiveStepHelper.java From herd with Apache License 2.0
@Override
public StepConfig getEmrStepConfig(Object step) {
    EmrHiveStep emrHiveStep = (EmrHiveStep) step;

    // Default ActionOnFailure is to cancel the execution and wait
    ActionOnFailure actionOnFailure = ActionOnFailure.CANCEL_AND_WAIT;
    if (emrHiveStep.isContinueOnError() != null && emrHiveStep.isContinueOnError()) {
        // Override based on user input
        actionOnFailure = ActionOnFailure.CONTINUE;
    }

    // If there are no arguments to the Hive script
    if (CollectionUtils.isEmpty(emrHiveStep.getScriptArguments())) {
        // Just build the StepConfig object and return
        return new StepConfig().withName(emrHiveStep.getStepName().trim()).withActionOnFailure(actionOnFailure)
            .withHadoopJarStep(new StepFactory().newRunHiveScriptStep(emrHiveStep.getScriptLocation().trim()));
    }
    // If there are arguments specified
    else {
        // For each argument, add the "-d" option
        List<String> hiveArgs = new ArrayList<>();
        for (String hiveArg : emrHiveStep.getScriptArguments()) {
            hiveArgs.add("-d");
            hiveArgs.add(hiveArg);
        }
        // Return the StepConfig object
        return new StepConfig().withName(emrHiveStep.getStepName().trim()).withActionOnFailure(actionOnFailure)
            .withHadoopJarStep(new StepFactory().newRunHiveScriptStep(emrHiveStep.getScriptLocation().trim(),
                hiveArgs.toArray(new String[hiveArgs.size()])));
    }
}
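Each "-d" flag defines a Hive variable (name=value) that the script can reference as ${name}. A minimal sketch of the equivalent direct StepFactory call this helper produces for two arguments; the script location and variable values are hypothetical:

// Minimal sketch; the script location and variable values are hypothetical.
StepConfig hiveStep = new StepConfig()
    .withName("Run hive query")
    .withActionOnFailure(ActionOnFailure.CANCEL_AND_WAIT)
    .withHadoopJarStep(new StepFactory().newRunHiveScriptStep(
        "s3://my-bucket/scripts/query.hql",
        "-d", "INPUT=s3://my-bucket/input",    // referenced in the script as ${INPUT}
        "-d", "OUTPUT=s3://my-bucket/output"));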
Example #4
Source File: EmrDaoImpl.java From herd with Apache License 2.0
/**
 * Creates the list of step config objects for Hive/Pig installation.
 *
 * @param emrClusterDefinition the EMR cluster definition
 *
 * @return the list of step configurations containing all the steps for the given definition
 */
private List<StepConfig> getStepConfig(EmrClusterDefinition emrClusterDefinition) {
    StepFactory stepFactory = new StepFactory();
    List<StepConfig> appSteps = new ArrayList<>();

    // Create the install Hive step and add it to the StepConfig list
    if (StringUtils.isNotBlank(emrClusterDefinition.getHiveVersion())) {
        StepConfig installHive = new StepConfig().withName("Hive " + emrClusterDefinition.getHiveVersion())
            .withActionOnFailure(ActionOnFailure.TERMINATE_JOB_FLOW)
            .withHadoopJarStep(stepFactory.newInstallHiveStep(emrClusterDefinition.getHiveVersion()));
        appSteps.add(installHive);
    }

    // Create the install Pig step and add it to the StepConfig list
    if (StringUtils.isNotBlank(emrClusterDefinition.getPigVersion())) {
        StepConfig installPig = new StepConfig().withName("Pig " + emrClusterDefinition.getPigVersion())
            .withActionOnFailure(ActionOnFailure.TERMINATE_JOB_FLOW)
            .withHadoopJarStep(stepFactory.newInstallPigStep(emrClusterDefinition.getPigVersion()));
        appSteps.add(installPig);
    }

    // Add the hadoop jar steps that need to be added.
    if (!CollectionUtils.isEmpty(emrClusterDefinition.getHadoopJarSteps())) {
        for (HadoopJarStep hadoopJarStep : emrClusterDefinition.getHadoopJarSteps()) {
            StepConfig stepConfig = emrHelper.getEmrHadoopJarStepConfig(hadoopJarStep.getStepName(),
                hadoopJarStep.getJarLocation(), hadoopJarStep.getMainClass(),
                hadoopJarStep.getScriptArguments(), hadoopJarStep.isContinueOnError());
            appSteps.add(stepConfig);
        }
    }

    return appSteps;
}
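Note that ActionOnFailure.TERMINATE_JOB_FLOW is the legacy spelling; the SDK also provides TERMINATE_CLUSTER with the same effect. A minimal sketch of the install-Hive step using the newer constant (the version string is hypothetical):

// Minimal sketch; "0.13.1" is a hypothetical Hive version string.
StepConfig installHive = new StepConfig()
    .withName("Hive 0.13.1")
    .withActionOnFailure(ActionOnFailure.TERMINATE_CLUSTER)
    .withHadoopJarStep(new StepFactory().newInstallHiveStep("0.13.1"));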
Example #5
Source File: LambdaContainer.java From aws-big-data-blog with Apache License 2.0
protected String fireEMRJob(String paramsStr, String clusterId) {
    StepFactory stepFactory = new StepFactory();
    AmazonElasticMapReduceClient emr = new AmazonElasticMapReduceClient();
    emr.setRegion(Region.getRegion(Regions.fromName(System.getenv().get("AWS_REGION"))));

    // Note: this Application object is built but never used in the request below
    Application sparkConfig = new Application()
        .withName("Spark");

    String[] params = paramsStr.split(",");

    // Note: this debugging step is built but never added to the request below
    StepConfig enabledebugging = new StepConfig()
        .withName("Enable debugging")
        .withActionOnFailure("TERMINATE_JOB_FLOW")
        .withHadoopJarStep(stepFactory.newEnableDebuggingStep());

    HadoopJarStepConfig sparkStepConf = new HadoopJarStepConfig()
        .withJar("command-runner.jar")
        .withArgs(params);

    final StepConfig sparkStep = new StepConfig()
        .withName("Spark Step")
        .withActionOnFailure("CONTINUE")
        .withHadoopJarStep(sparkStepConf);

    AddJobFlowStepsRequest request = new AddJobFlowStepsRequest(clusterId)
        .withSteps(new ArrayList<StepConfig>() {{ add(sparkStep); }});
    AddJobFlowStepsResult result = emr.addJobFlowSteps(request);
    return result.getStepIds().get(0);
}
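On release-label clusters, command-runner.jar runs the command given in its arguments, so paramsStr is expected to be a comma-separated spark-submit invocation. A hypothetical caller; the main class and application jar location are assumptions:

// Hypothetical example of building paramsStr for the method above;
// the application jar and main class are assumptions.
String paramsStr = String.join(",",
    "spark-submit",
    "--class", "com.example.MySparkApp",
    "s3://my-bucket/jars/my-spark-app.jar");
String stepId = fireEMRJob(paramsStr, "j-xxxxxxxxxxxx");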
Example #6
Source File: create_cluster.java From aws-doc-sdk-examples with Apache License 2.0
public static void main(String[] args) {
    AWSCredentials credentials_profile = null;
    try {
        // specifies any named profile in .aws/credentials as the credentials provider
        credentials_profile = new ProfileCredentialsProvider("default").getCredentials();
    } catch (Exception e) {
        throw new AmazonClientException(
                "Cannot load credentials from .aws/credentials file. " +
                "Make sure that the credentials file exists and that the profile name is defined within it.",
                e);
    }

    // create an EMR client using the specified credentials and region in order to create the cluster
    AmazonElasticMapReduce emr = AmazonElasticMapReduceClientBuilder.standard()
        .withCredentials(new AWSStaticCredentialsProvider(credentials_profile))
        .withRegion(Regions.US_WEST_1)
        .build();

    // create a step to enable debugging in the AWS Management Console
    StepFactory stepFactory = new StepFactory();
    StepConfig enabledebugging = new StepConfig()
        .withName("Enable debugging")
        .withActionOnFailure("TERMINATE_JOB_FLOW")
        .withHadoopJarStep(stepFactory.newEnableDebuggingStep());

    // specify applications to be installed and configured when EMR creates the cluster
    Application hive = new Application().withName("Hive");
    Application spark = new Application().withName("Spark");
    Application ganglia = new Application().withName("Ganglia");
    Application zeppelin = new Application().withName("Zeppelin");

    // create the cluster
    RunJobFlowRequest request = new RunJobFlowRequest()
        .withName("MyClusterCreatedFromJava")
        .withReleaseLabel("emr-5.20.0") // specifies the EMR release label; the latest release is recommended
        .withSteps(enabledebugging)
        .withApplications(hive, spark, ganglia, zeppelin)
        .withLogUri("s3://path/to/my/emr/logs") // an S3 URI for log files is required when debugging is enabled
        .withServiceRole("EMR_DefaultRole")     // replace the default with a custom IAM service role if one is used
        .withJobFlowRole("EMR_EC2_DefaultRole") // replace the default with a custom EMR role for the EC2 instance profile if one is used
        .withInstances(new JobFlowInstancesConfig()
            .withEc2SubnetId("subnet-12ab34c56")
            .withEc2KeyName("myEc2Key")
            .withInstanceCount(3)
            .withKeepJobFlowAliveWhenNoSteps(true)
            .withMasterInstanceType("m4.large")
            .withSlaveInstanceType("m4.large"));

    RunJobFlowResult result = emr.runJobFlow(request);
    System.out.println("The cluster ID is " + result.getJobFlowId());
}
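A minimal sketch of polling the new cluster until it finishes provisioning, assuming the emr client and result variables from the example above:

// Minimal sketch: poll the cluster state until it leaves STARTING/BOOTSTRAPPING.
// Assumes the emr client and result variables from the example above.
String clusterId = result.getJobFlowId();
String state = emr.describeCluster(new DescribeClusterRequest().withClusterId(clusterId))
    .getCluster().getStatus().getState();
while (state.equals("STARTING") || state.equals("BOOTSTRAPPING")) {
    try {
        Thread.sleep(30 * 1000);
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
        break;
    }
    state = emr.describeCluster(new DescribeClusterRequest().withClusterId(clusterId))
        .getCluster().getStatus().getState();
    System.out.println("Cluster " + clusterId + " is " + state);
}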
Example #7
Source File: create-spark-cluster.java From aws-doc-sdk-examples with Apache License 2.0
public static void main(String[] args) {
    AWSCredentials credentials_profile = null;
    try {
        credentials_profile = new ProfileCredentialsProvider("default").getCredentials();
    } catch (Exception e) {
        throw new AmazonClientException(
                "Cannot load credentials from .aws/credentials file. " +
                "Make sure that the credentials file exists and the profile name is specified within it.",
                e);
    }

    AmazonElasticMapReduce emr = AmazonElasticMapReduceClientBuilder.standard()
        .withCredentials(new AWSStaticCredentialsProvider(credentials_profile))
        .withRegion(Regions.US_WEST_1)
        .build();

    // create a step to enable debugging in the AWS Management Console
    StepFactory stepFactory = new StepFactory();
    StepConfig enabledebugging = new StepConfig()
        .withName("Enable debugging")
        .withActionOnFailure("TERMINATE_JOB_FLOW")
        .withHadoopJarStep(stepFactory.newEnableDebuggingStep());

    Application spark = new Application().withName("Spark");

    RunJobFlowRequest request = new RunJobFlowRequest()
        .withName("Spark Cluster")
        .withReleaseLabel("emr-5.20.0")
        .withSteps(enabledebugging)
        .withApplications(spark)
        .withLogUri("s3://path/to/my/logs/")
        .withServiceRole("EMR_DefaultRole")
        .withJobFlowRole("EMR_EC2_DefaultRole")
        .withInstances(new JobFlowInstancesConfig()
            .withEc2SubnetId("subnet-12ab3c45")
            .withEc2KeyName("myEc2Key")
            .withInstanceCount(3)
            .withKeepJobFlowAliveWhenNoSteps(true)
            .withMasterInstanceType("m4.large")
            .withSlaveInstanceType("m4.large"));

    RunJobFlowResult result = emr.runJobFlow(request);
    System.out.println("The cluster ID is " + result.getJobFlowId());
}
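Because withKeepJobFlowAliveWhenNoSteps(true) keeps the cluster running after its steps finish, it must be terminated explicitly when no longer needed. A minimal sketch, assuming the emr client and result variables from the example above:

// Minimal sketch: shut the cluster down when done.
// Assumes the emr client and result variables from the example above.
emr.terminateJobFlows(new TerminateJobFlowsRequest()
    .withJobFlowIds(result.getJobFlowId()));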
Example #8
Source File: EmrOperatorFactory.java From digdag with Apache License 2.0
private StepFactory stepFactory() {
    // TODO: configure region
    return new StepFactory();
}
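The no-arg constructor points at the us-east-1 tools bucket; StepFactory also has a constructor taking an explicit S3 bucket, which is presumably what the TODO refers to. A hedged sketch, assuming EMR's conventional "<region>.elasticmapreduce" bucket naming:

// A sketch of one way to resolve the TODO; assumes EMR's conventional
// regional "<region>.elasticmapreduce" tool buckets.
private StepFactory stepFactory(Regions region) {
    return new StepFactory(region.getName() + ".elasticmapreduce");
}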
Example #9
Source File: EMRUtils.java From aws-big-data-blog with Apache License 2.0
/**
 * This method uses the AWS SDK for Java to launch an Apache HBase cluster on Amazon EMR.
 *
 * @param client - AmazonElasticMapReduce client that interfaces directly with the Amazon EMR Web Service
 * @param clusterIdentifier - identifier of an existing cluster
 * @param amiVersion - AMI to use for launching this cluster
 * @param keypair - A keypair for SSHing into the Amazon EMR master node
 * @param masterInstanceType - Master node Amazon EC2 instance type
 * @param coreInstanceType - core nodes Amazon EC2 instance type
 * @param logUri - An Amazon S3 bucket for your logs
 * @param numberOfNodes - total number of nodes in this cluster including master node
 * @return the job flow ID of the running cluster
 */
public static String createCluster(AmazonElasticMapReduce client,
        String clusterIdentifier,
        String amiVersion,
        String keypair,
        String masterInstanceType,
        String coreInstanceType,
        String logUri,
        int numberOfNodes) {

    if (clusterExists(client, clusterIdentifier)) {
        LOG.info("Cluster " + clusterIdentifier + " is available");
        return clusterIdentifier;
    }

    // Error checking
    if (amiVersion == null || amiVersion.isEmpty()) throw new RuntimeException("ERROR: Please specify an AMI Version");
    if (keypair == null || keypair.isEmpty()) throw new RuntimeException("ERROR: Please specify a valid Amazon Key Pair");
    if (masterInstanceType == null || masterInstanceType.isEmpty()) throw new RuntimeException("ERROR: Please specify a Master Instance Type");
    if (logUri == null || logUri.isEmpty()) throw new RuntimeException("ERROR: Please specify a valid Amazon S3 bucket for your logs.");
    if (numberOfNodes < 1) throw new RuntimeException("ERROR: Please specify at least 1 node");

    RunJobFlowRequest request = new RunJobFlowRequest()
        .withAmiVersion(amiVersion)
        .withBootstrapActions(new BootstrapActionConfig()
            .withName("Install HBase")
            .withScriptBootstrapAction(new ScriptBootstrapActionConfig()
                .withPath("s3://elasticmapreduce/bootstrap-actions/setup-hbase")))
        .withName("Job Flow With HBase Actions")
        .withSteps(
            // enable debugging step
            new StepConfig()
                .withName("Enable debugging")
                .withActionOnFailure("TERMINATE_CLUSTER")
                .withHadoopJarStep(new StepFactory().newEnableDebuggingStep()),
            // start HBase step - after installing it with a bootstrap action
            createStepConfig("Start HBase", "TERMINATE_CLUSTER", "/home/hadoop/lib/hbase.jar", getHBaseArgs()),
            // add HBase backup step
            createStepConfig("Modify backup schedule", "TERMINATE_JOB_FLOW", "/home/hadoop/lib/hbase.jar", getHBaseBackupArgs()))
        .withLogUri(logUri)
        .withInstances(new JobFlowInstancesConfig()
            .withEc2KeyName(keypair)
            .withInstanceCount(numberOfNodes)
            .withKeepJobFlowAliveWhenNoSteps(true)
            .withMasterInstanceType(masterInstanceType)
            .withSlaveInstanceType(coreInstanceType));

    RunJobFlowResult result = client.runJobFlow(request);

    String state = null;
    while (!(state = clusterState(client, result.getJobFlowId())).equalsIgnoreCase("waiting")) {
        try {
            Thread.sleep(10 * 1000);
            LOG.info(result.getJobFlowId() + " is " + state + ". Waiting for cluster to become available.");
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt(); // restore the interrupt flag instead of swallowing it
        }
        if (state.equalsIgnoreCase("TERMINATED_WITH_ERRORS")) {
            LOG.error("Could not create EMR Cluster");
            System.exit(-1);
        }
    }

    LOG.info("Created cluster " + result.getJobFlowId());
    LOG.info("Cluster " + clusterIdentifier + " is available");
    return result.getJobFlowId();
}
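The createStepConfig helper above is not shown in this excerpt. A hypothetical reconstruction, inferred from how it is called; the blog's actual implementation may differ:

// Hypothetical reconstruction of the helper used above; the argument
// list type is an assumption.
private static StepConfig createStepConfig(String name, String actionOnFailure, String jar, List<String> args) {
    return new StepConfig()
        .withName(name)
        .withActionOnFailure(actionOnFailure)
        .withHadoopJarStep(new HadoopJarStepConfig()
            .withJar(jar)
            .withArgs(args));
}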