Java Code Examples for burlap.behavior.singleagent.Episode#numTimeSteps()
The following examples show how to use burlap.behavior.singleagent.Episode#numTimeSteps().
The original project and source file for each example are noted above the code.
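As a quick orientation: an Episode records the full state sequence of an interaction, so numTimeSteps() returns the number of recorded states. An episode with k actions therefore returns k+1, which is why the examples below use numTimeSteps()-1 whenever they need the number of actions or rewards. Below is a minimal editorial sketch (not from any of the projects listed) of iterating an episode's transitions; the variable e is assumed to be an Episode returned by one of the calls shown in the examples, e.g. PolicyUtils.rollout(...) or a LearningAgent's runLearningEpisode(...).

// Hypothetical sketch: `e` is assumed to be an Episode obtained elsewhere.
int numStates  = e.numTimeSteps();      // number of recorded states
int numActions = e.numTimeSteps() - 1;  // number of actions (and rewards) taken

// Iterate over the (s, a, r, s') transitions of the episode; reward(t + 1) is the
// reward received for taking action(t) in state(t), matching the indexing used in Example 6 below.
for (int t = 0; t < e.numTimeSteps() - 1; t++) {
    System.out.println(e.state(t) + " " + e.action(t)
            + " -> reward " + e.reward(t + 1) + " -> " + e.state(t + 1));
}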
Example 1
Source File: LSPI.java From burlap with Apache License 2.0
@Override
public Episode runLearningEpisode(Environment env, int maxSteps) {

    Episode ea = maxSteps != -1 ? PolicyUtils.rollout(this.learningPolicy, env, maxSteps) : PolicyUtils.rollout(this.learningPolicy, env);

    this.updateDatasetWithLearningEpisode(ea);

    if(this.shouldRereunPolicyIteration(ea)){
        this.runPolicyIteration(this.maxNumPlanningIterations, this.maxChange);
        this.numStepsSinceLastLearningPI = 0;
    }
    else{
        this.numStepsSinceLastLearningPI += ea.numTimeSteps()-1;
    }

    if(episodeHistory.size() >= numEpisodesToStore){
        episodeHistory.poll();
    }
    episodeHistory.offer(ea);

    return ea;
}
Example 2
Source File: LearningAlgorithmExperimenter.java From burlap with Apache License 2.0
/**
 * Runs a trial for an agent generated by the given factory when interpreting trial length as a number of total steps.
 * @param agentFactory the agent factory used to generate the agent to test.
 */
protected void runStepBoundTrial(LearningAgentFactory agentFactory){

    //temporarily disable plotter data collection to avoid possible contamination from any actions taken by the agent generation
    //(e.g., if there is pre-test training)
    this.plotter.toggleDataCollection(false);

    LearningAgent agent = agentFactory.generateAgent();

    this.plotter.toggleDataCollection(true); //turn it back on to begin

    this.plotter.startNewTrial();

    int stepsRemaining = this.trialLength;
    while(stepsRemaining > 0){
        Episode ea = agent.runLearningEpisode(this.environmentSever, stepsRemaining);
        stepsRemaining -= ea.numTimeSteps()-1; //-1 because we want to subtract the number of actions, not the number of states seen
        this.plotter.endEpisode();
        this.environmentSever.resetEnvironment();
    }

    this.plotter.endTrial();
}
Example 3
Source File: PolicyUtils.java From burlap with Apache License 2.0
/**
 * Follows the policy in the given {@link burlap.mdp.singleagent.environment.Environment}. The policy will stop being followed once a terminal state
 * in the environment is reached or when the provided number of steps has been taken.
 * @param p the {@link Policy}
 * @param env The {@link burlap.mdp.singleagent.environment.Environment} in which this policy is to be evaluated.
 * @param numSteps the maximum number of steps to take in the environment.
 * @return An {@link Episode} object specifying the interaction with the environment.
 */
public static Episode rollout(Policy p, Environment env, int numSteps){
    Episode ea = new Episode(env.currentObservation());

    int nSteps;
    do{
        followAndRecordPolicy(p, env, ea);
        nSteps = ea.numTimeSteps();
    }while(!env.isInTerminalState() && nSteps < numSteps);

    return ea;
}
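As a usage illustration for the rollout method above, here is a minimal sketch of rolling out a policy in a simulated environment and reading the step count off the returned Episode. This is an editorial example, not part of PolicyUtils.java: the grid-world setup, the RandomPolicy choice, and the import paths are assumptions based on BURLAP 3's standard layout.

import burlap.behavior.policy.Policy;
import burlap.behavior.policy.PolicyUtils;
import burlap.behavior.policy.RandomPolicy;
import burlap.behavior.singleagent.Episode;
import burlap.domain.singleagent.gridworld.GridWorldDomain;
import burlap.domain.singleagent.gridworld.state.GridAgent;
import burlap.domain.singleagent.gridworld.state.GridWorldState;
import burlap.mdp.singleagent.SADomain;
import burlap.mdp.singleagent.environment.SimulatedEnvironment;

public class RolloutStepCountExample {

    public static void main(String[] args) {
        // Hypothetical setup: a small four-rooms grid world with the agent starting at (0, 0).
        GridWorldDomain gwd = new GridWorldDomain(11, 11);
        gwd.setMapToFourRooms();
        SADomain domain = gwd.generateDomain();
        SimulatedEnvironment env = new SimulatedEnvironment(domain, new GridWorldState(new GridAgent(0, 0)));

        // Follow a random policy for at most 100 steps (or until a terminal state).
        Policy p = new RandomPolicy(domain);
        Episode e = PolicyUtils.rollout(p, env, 100);

        // numTimeSteps() counts recorded states, so the number of actions taken is one less.
        System.out.println("states recorded: " + e.numTimeSteps());
        System.out.println("actions taken:   " + (e.numTimeSteps() - 1));
    }
}

The returned Episode can then be indexed with state(t), action(t), and reward(t+1) exactly as the other examples on this page do.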
Example 4
Source File: MLIRL.java From burlap with Apache License 2.0
/**
 * Computes and returns the log-likelihood of the given trajectory under the current reward function parameters, weighted by the given weight.
 * @param ea the trajectory
 * @param weight the weight to assign the trajectory
 * @return the weighted log-likelihood of the given trajectory under the current reward function parameters.
 */
public double logLikelihoodOfTrajectory(Episode ea, double weight){
    double logLike = 0.;
    Policy p = new BoltzmannQPolicy((QProvider)this.request.getPlanner(), 1./this.request.getBoltzmannBeta());
    for(int i = 0; i < ea.numTimeSteps()-1; i++){
        this.request.getPlanner().planFromState(ea.state(i));
        double actProb = p.actionProb(ea.state(i), ea.action(i));
        logLike += Math.log(actProb);
    }
    logLike *= weight;
    return logLike;
}
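Read as a formula (an editorial restatement, not part of the source file): with $\pi$ the Boltzmann (softmax) policy over the planner's Q-values constructed above, and $T = \texttt{numTimeSteps()} - 1$ the number of recorded actions, the loop accumulates

$$\text{logLike} = \text{weight} \cdot \sum_{t=0}^{T-1} \log \pi\big(a_t \mid s_t\big).$$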
Example 5
Source File: MLIRL.java From burlap with Apache License 2.0
/**
 * Computes and returns the gradient of the log-likelihood of all trajectories.
 * @return the gradient of the log-likelihood of all trajectories
 */
public FunctionGradient logLikelihoodGradient(){
    HashedAggregator<Integer> gradientSum = new HashedAggregator<Integer>();

    double [] weights = this.request.getEpisodeWeights();
    List<Episode> exampleTrajectories = this.request.getExpertEpisodes();

    for(int i = 0; i < exampleTrajectories.size(); i++){
        Episode ea = exampleTrajectories.get(i);
        double weight = weights[i];

        for(int t = 0; t < ea.numTimeSteps()-1; t++){
            this.request.getPlanner().planFromState(ea.state(t));
            FunctionGradient policyGrad = this.logPolicyGrad(ea.state(t), ea.action(t));

            //weigh it by trajectory strength
            for(FunctionGradient.PartialDerivative pd : policyGrad.getNonZeroPartialDerivatives()){
                double newVal = pd.value * weight;
                gradientSum.add(pd.parameterId, newVal);
            }
        }
    }

    FunctionGradient gradient = new FunctionGradient.SparseGradient(gradientSum.size());
    for(Map.Entry<Integer, Double> e : gradientSum.entrySet()){
        gradient.put(e.getKey(), e.getValue());
    }

    return gradient;
}
Example 6
Source File: LSPI.java From burlap with Apache License 2.0
/**
 * Updates this object's {@link SARSData} to include the results of a learning episode.
 * @param ea the learning episode as an {@link Episode} object.
 */
protected void updateDatasetWithLearningEpisode(Episode ea){
    if(this.dataset == null){
        this.dataset = new SARSData(ea.numTimeSteps()-1);
    }
    for(int i = 0; i < ea.numTimeSteps()-1; i++){
        this.dataset.add(ea.state(i), ea.action(i), ea.reward(i+1), ea.state(i+1));
    }
}
Example 7
Source File: LSPI.java From burlap with Apache License 2.0
/**
 * Returns whether LSPI should be rerun given the latest learning episode results. Default behavior is to return true
 * if the number of learning episode steps plus the number of steps since the last run is greater than the {@link #minNewStepsForLearningPI} threshold.
 * @param ea the most recent learning episode
 * @return true if LSPI should be rerun; false otherwise.
 */
protected boolean shouldRereunPolicyIteration(Episode ea){
    if(this.numStepsSinceLastLearningPI+ea.numTimeSteps()-1 > this.minNewStepsForLearningPI){
        return true;
    }
    return false;
}
Example 8
Source File: TrainingHelper.java From burlap_caffe with Apache License 2.0
public void run() {
    int testCountDown = testInterval;
    int snapshotCountDown = snapshotInterval;

    long trainingStart = System.currentTimeMillis();
    int trainingSteps = 0;

    while (stepCounter < totalTrainingSteps) {
        long epStartTime = 0;
        if (verbose) {
            System.out.println(String.format("Training Episode %d at step %d", episodeCounter, stepCounter));
            epStartTime = System.currentTimeMillis();
        }

        // Set variables needed for training
        prepareForTraining();
        env.resetEnvironment();

        // run learning episode
        Episode ea = learner.runLearningEpisode(env, Math.min(totalTrainingSteps - stepCounter, maxEpisodeSteps));

        // add up episode reward
        double totalReward = 0;
        for (double r : ea.rewardSequence) {
            totalReward += r;
        }

        if (verbose) {
            // output episode data
            long epEndTime = System.currentTimeMillis();
            double timeInterval = (epEndTime - epStartTime)/1000.0;
            System.out.println(String.format("Episode reward: %.2f -- %.1f steps/sec", totalReward, ea.numTimeSteps()/timeInterval));
            System.out.println();
        }

        // take snapshot every snapshotCountDown steps
        stepCounter += ea.numTimeSteps();
        trainingSteps += ea.numTimeSteps();
        episodeCounter++;

        if (snapshotPrefix != null) {
            snapshotCountDown -= ea.numTimeSteps();
            if (snapshotCountDown <= 0) {
                saveLearningState(snapshotPrefix);
                snapshotCountDown += snapshotInterval;
            }
        }

        // take test set every testCountDown steps
        testCountDown -= ea.numTimeSteps();
        if (testCountDown <= 0) {
            double trainingTimeInterval = (System.currentTimeMillis() - trainingStart)/1000.0;

            // run test set
            runTestSet();
            testCountDown += testInterval;

            // output training rate
            System.out.printf("Training rate: %.1f steps/sec\n\n", testInterval/trainingTimeInterval);

            // restart training timer
            trainingStart = System.currentTimeMillis();
        }
    }

    if (testOutput != null) {
        testOutput.printf("Final best: %.2f\n", highestAverageReward);
        testOutput.flush();
    }

    System.out.println("Done Training!");
}
Example 9
Source File: TrainingHelper.java From burlap_caffe with Apache License 2.0
public void runTestSet() {
    long testStart = System.currentTimeMillis();

    int numSteps = 0;
    int numEpisodes = 0;

    // Change any learning variables to test values (i.e. experience memory)
    prepareForTesting();

    // Run the test policy on test episodes
    System.out.println("Running Test Set...");
    double totalTestReward = 0;
    while (true) {
        env.resetEnvironment();
        Episode e = tester.runTestEpisode(env, Math.min(maxEpisodeSteps, totalTestSteps - numSteps));

        double totalReward = 0;
        for (double reward : e.rewardSequence) {
            totalReward += reward;
        }

        if (verbose) {
            System.out.println(String.format("%d: Reward = %.2f, Steps = %d", numEpisodes, totalReward, numSteps));
        }

        numSteps += e.numTimeSteps();
        if (numSteps >= totalTestSteps) {
            if (numEpisodes == 0) {
                totalTestReward = totalReward;
                numEpisodes = 1;
            }
            break;
        }

        totalTestReward += totalReward;
        numEpisodes += 1;
    }

    double averageReward = totalTestReward/numEpisodes;
    if (averageReward > highestAverageReward) {
        if (resultsPrefix != null) {
            vfa.snapshot(new File(resultsPrefix, "best_net.caffemodel").toString(), null);
        }
        highestAverageReward = averageReward;
    }

    double testTimeInterval = (System.currentTimeMillis() - testStart)/1000.0;
    System.out.printf("Average Test Reward: %.2f -- highest: %.2f, Test rate: %.1f\n\n",
            averageReward, highestAverageReward, numSteps/testTimeInterval);

    if (testOutput != null) {
        testOutput.printf("Frame %d: %.2f\n", stepCounter, averageReward);
        testOutput.flush();
    }
}
Example 10
Source File: BeliefSparseSampling.java From burlap with Apache License 2.0
public static void main(String [] args){
    TigerDomain tiger = new TigerDomain(true);
    PODomain domain = (PODomain)tiger.generateDomain();
    BeliefState initialBelief = TigerDomain.getInitialBeliefState(domain);

    BeliefSparseSampling bss = new BeliefSparseSampling(domain, 0.99, new ReflectiveHashableStateFactory(), 10, -1);
    Policy p = new GreedyQPolicy(bss);

    SimulatedPOEnvironment env = new SimulatedPOEnvironment(domain);
    env.setCurStateTo(new TigerState(TigerDomain.VAL_LEFT));

    BeliefPolicyAgent agent = new BeliefPolicyAgent(domain, env, p);
    agent.setBeliefState(initialBelief);
    agent.setEnvironment(env);

    /*
    State initialBeliefStateOb = BeliefMDPGenerator.getBeliefMDPState(bss.getBeliefMDP(), initialBelief);
    List<QValue> qs = bss.getQs(initialBeliefStateOb);
    for(QValue q : qs){
        System.out.println(q.a.toString() + ": " + q.q);
    }
    */

    Episode ea = agent.actUntilTerminalOrMaxSteps(30);

    for(int i = 0; i < ea.numTimeSteps()-1; i++){
        System.out.println(ea.action(i) + " " + ea.reward(i+1));
    }
}