burlap.behavior.valuefunction.QProvider Java Examples
The following examples show how to use
burlap.behavior.valuefunction.QProvider.
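Before the examples, here is a minimal sketch (not taken from any of the projects below) of how a QProvider is typically queried, using only calls that appear in the examples on this page. The class and variable names are placeholders, and the import paths assume the BURLAP 3 package layout.

import java.util.List;

import burlap.behavior.valuefunction.QProvider;
import burlap.behavior.valuefunction.QValue;
import burlap.mdp.core.state.State;

public class QProviderSketch {

    //prints each action's estimated Q-value and the greedy state value for a state
    public static void printQs(QProvider provider, State s){
        List<QValue> qs = provider.qValues(s); //one QValue per applicable action
        for(QValue q : qs){
            System.out.println(q.a + " -> " + q.q);
        }
        //max Q-value over actions, as used in Examples #1 and #14 below
        System.out.println("V(s) = " + QProvider.Helper.maxQ(provider, s));
    }
}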
Example #1
Source File: VITutorial.java From burlap_examples with MIT License
@Override
public GreedyQPolicy planFromState(State initialState) {

    HashableState hashedInitialState = this.hashingFactory.hashState(initialState);
    if(this.valueFunction.containsKey(hashedInitialState)){
        return new GreedyQPolicy(this); //already performed planning here!
    }

    //if the state is new, then find all reachable states from it first
    this.performReachabilityFrom(initialState);

    //now perform multiple iterations over the whole state space
    for(int i = 0; i < this.numIterations; i++){
        //iterate over each state
        for(HashableState sh : this.valueFunction.keySet()){
            //update its value using the bellman equation
            this.valueFunction.put(sh, QProvider.Helper.maxQ(this, sh.s()));
        }
    }

    return new GreedyQPolicy(this);
}
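A hedged usage sketch for the planner above: the VITutorial constructor arguments shown here are assumptions based on the tutorial this file comes from, so treat them as illustrative rather than authoritative; domain and initialState are placeholders.

//construct the tutorial planner and plan from a start state (constructor signature assumed)
VITutorial vi = new VITutorial(domain, 0.99, new SimpleHashableStateFactory(),
        new ConstantValueFunction(0.), 30);
GreedyQPolicy p = vi.planFromState(initialState); //runs the value iteration sweeps shown above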
Example #2
Source File: AnnealedEpsilonGreedy.java From burlap_caffe with Apache License 2.0
public AnnealedEpsilonGreedy(QProvider planner, double epsilonStart, double epsilonEnd, int annealingTime) {
    super(planner, epsilonStart);

    this.epsilonStart = epsilonStart;
    this.epsilonEnd = epsilonEnd;
    this.epsilonStep = (epsilonEnd - epsilonStart)/annealingTime;
    this.annealingTime = annealingTime;
}
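A hedged usage sketch: because QLearning implements QProvider (it is cast to one in Example #13), a learning agent can be paired with this annealed policy. The QLearning constructor mirrors Example #13; the epsilon schedule values and the setLearningPolicy call are assumptions, with domain, hashingFactory as placeholders.

//anneal epsilon from 1.0 down to 0.1 over 100000 selections (illustrative values only)
QLearning agent = new QLearning(domain, 0.99, hashingFactory, 0., 1.);
AnnealedEpsilonGreedy exploration = new AnnealedEpsilonGreedy(agent, 1.0, 0.1, 100000);
agent.setLearningPolicy(exploration); //assumed setter on QLearning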
Example #3
Source File: BoltzmannPolicyGradient.java From burlap with Apache License 2.0
/**
 * Computes the gradient of a Boltzmann policy using the given differentiable valueFunction.
 * @param s the input state of the policy gradient
 * @param a the action whose policy probability gradient is being queried
 * @param planner the differentiable {@link DifferentiableQFunction} valueFunction
 * @param beta the Boltzmann beta parameter. This parameter is the inverse of the Boltzmann temperature. As beta becomes larger, the policy becomes more deterministic. Should lie in [0, +infinity].
 * @return the gradient of the policy.
 */
public static FunctionGradient computeBoltzmannPolicyGradient(State s, Action a, DifferentiableQFunction planner, double beta){

    //get q objects
    List<QValue> Qs = ((QProvider)planner).qValues(s);
    double [] qs = new double[Qs.size()];
    for(int i = 0; i < Qs.size(); i++){
        qs[i] = Qs.get(i).q;
    }

    //find matching action index
    int aind = -1;
    for(int i = 0; i < Qs.size(); i++){
        if(Qs.get(i).a.equals(a)){
            aind = i;
            break;
        }
    }

    if(aind == -1){
        throw new RuntimeException("Error in computing BoltzmannPolicyGradient: Could not find query action in Q-value list.");
    }

    FunctionGradient [] qGradients = new FunctionGradient[qs.length];
    for(int i = 0; i < qs.length; i++){
        qGradients[i] = planner.qGradient(s, Qs.get(i).a);
    }

    FunctionGradient policyGradient = computePolicyGradient(qs, qGradients, aind, beta);

    return policyGradient;
}
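For reference (a standard definition, not quoted from the BURLAP source), the Boltzmann policy whose gradient this method computes has the form

    \pi_\beta(a \mid s) = \exp(\beta Q(s,a)) / \sum_{a'} \exp(\beta Q(s,a'))

which is why the method first collects every Q-value and its gradient before combining them in computePolicyGradient via the chain rule.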
Example #4
Source File: MLIRL.java From burlap with Apache License 2.0
/**
 * Computes and returns the gradient of the Boltzmann policy for the given state and action.
 * @param s the state in which the policy is queried
 * @param ga the action for which the policy is queried.
 * @return the gradient of the Boltzmann policy for the given state and action.
 */
public FunctionGradient logPolicyGrad(State s, Action ga){

    Policy p = new BoltzmannQPolicy((QProvider)this.request.getPlanner(), 1./this.request.getBoltzmannBeta());
    double invActProb = 1./p.actionProb(s, ga);
    FunctionGradient gradient = BoltzmannPolicyGradient.computeBoltzmannPolicyGradient(s, ga, (DifferentiableQFunction)this.request.getPlanner(), this.request.getBoltzmannBeta());

    for(FunctionGradient.PartialDerivative pd : gradient.getNonZeroPartialDerivatives()){
        double newVal = pd.value * invActProb;
        gradient.put(pd.parameterId, newVal);
    }

    return gradient;
}
Example #5
Source File: MLIRL.java From burlap with Apache License 2.0
/**
 * Computes and returns the log-likelihood of the given trajectory under the current reward function parameters, weighted by the given weight.
 * @param ea the trajectory
 * @param weight the weight to assign the trajectory
 * @return the weighted log-likelihood of the given trajectory under the current reward function parameters.
 */
public double logLikelihoodOfTrajectory(Episode ea, double weight){
    double logLike = 0.;
    Policy p = new BoltzmannQPolicy((QProvider)this.request.getPlanner(), 1./this.request.getBoltzmannBeta());
    for(int i = 0; i < ea.numTimeSteps()-1; i++){
        this.request.getPlanner().planFromState(ea.state(i));
        double actProb = p.actionProb(ea.state(i), ea.action(i));
        logLike += Math.log(actProb);
    }
    logLike *= weight;
    return logLike;
}
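In other words (a restatement of the loop above, not additional source code), the returned quantity is

    weight \cdot \sum_{t=0}^{T-1} \log \pi(a_t \mid s_t)

where \pi is the Boltzmann Q-policy with temperature 1/\beta and the sum runs over the trajectory's time steps.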
Example #6
Source File: GreedyQPolicy.java From burlap with Apache License 2.0
@Override
public void setSolver(MDPSolverInterface solver){

    if(!(solver instanceof QProvider)){
        throw new RuntimeErrorException(new Error("Planner is not a QComputablePlanner"));
    }

    this.qplanner = (QProvider) solver;
}
Example #7
Source File: EpsilonGreedy.java From burlap with Apache License 2.0
@Override
public void setSolver(MDPSolverInterface solver){

    if(!(solver instanceof QProvider)){
        throw new RuntimeErrorException(new Error("Planner is not a QComputablePlanner"));
    }

    this.qplanner = (QProvider) solver;
}
Example #8
Source File: GreedyDeterministicQPolicy.java From burlap with Apache License 2.0
@Override
public void setSolver(MDPSolverInterface solver){

    if(!(solver instanceof QProvider)){
        throw new RuntimeErrorException(new Error("Planner is not a QComputablePlanner"));
    }

    this.qplanner = (QProvider) solver;
}
Example #9
Source File: BoltzmannQPolicy.java From burlap with Apache License 2.0
@Override
public void setSolver(MDPSolverInterface solver) {

    if(!(solver instanceof QProvider)){
        throw new RuntimeErrorException(new Error("Planner is not a QComputablePlanner"));
    }

    this.qplanner = (QProvider) solver;
}
Example #10
Source File: QMDP.java From burlap with Apache License 2.0
/**
 * Initializes.
 * @param domain the POMDP domain
 * @param mdpQSource the underlying fully observable MDP {@link QProvider} source.
 */
public QMDP(PODomain domain, QProvider mdpQSource){
    this.mdpQSource = mdpQSource;
    Planner planner = (Planner)this.mdpQSource;
    this.solverInit(domain, planner.getGamma(), planner.getHashingFactory());
}
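A hedged construction sketch: any QProvider-implementing planner can serve as the mdpQSource. Here ValueIteration is used with its commonly documented constructor, which should be treated as an assumption, as should passing the PODomain directly as the fully observable model; poDomain and hashingFactory are placeholders.

//plan over the underlying fully observable MDP and hand the result to QMDP (signatures assumed)
ValueIteration vi = new ValueIteration(poDomain, 0.99, hashingFactory, 0.01, 100);
QMDP qmdp = new QMDP(poDomain, vi);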
Example #11
Source File: EpsilonGreedy.java From burlap with Apache License 2.0
/**
 * Initializes with the QComputablePlanner to use and the value of epsilon, where epsilon is the probability of taking a random action.
 * @param planner the QComputablePlanner to use
 * @param epsilon the probability of taking a random action.
 */
public EpsilonGreedy(QProvider planner, double epsilon) {
    qplanner = planner;
    this.epsilon = epsilon;
    rand = RandomFactory.getMapped(0);
}
Example #12
Source File: GreedyQPolicy.java From burlap with Apache License 2.0
/**
 * Initializes with a QComputablePlanner
 * @param planner the QComputablePlanner to use
 */
public GreedyQPolicy(QProvider planner){
    qplanner = planner;
    rand = RandomFactory.getMapped(0);
}
Example #13
Source File: BasicBehavior.java From burlap_examples with MIT License
public void qLearningExample(String outputPath){

    LearningAgent agent = new QLearning(domain, 0.99, hashingFactory, 0., 1.);

    //run learning for 50 episodes
    for(int i = 0; i < 50; i++){
        Episode e = agent.runLearningEpisode(env);

        e.write(outputPath + "ql_" + i);
        System.out.println(i + ": " + e.maxTimeStep());

        //reset environment for next learning episode
        env.resetEnvironment();
    }

    simpleValueFunctionVis((ValueFunction)agent, new GreedyQPolicy((QProvider) agent));
}
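As a possible follow-up to the example above (not part of the original file), the learned greedy policy could also be rolled out in the environment; PolicyUtils.rollout is assumed to be available in this BURLAP version.

//evaluate the learned policy greedily for one more episode
Policy greedy = new GreedyQPolicy((QProvider) agent);
env.resetEnvironment();
Episode e = PolicyUtils.rollout(greedy, env);
System.out.println("greedy episode steps: " + e.maxTimeStep());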
Example #14
Source File: QLTutorial.java From burlap_examples with MIT License
@Override
public double value(State s) {
    return QProvider.Helper.maxQ(this, s);
}
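For context, QProvider.Helper.maxQ returns the largest Q-value among the state's applicable actions; a rough sketch of that computation (not the BURLAP source itself) is:

//maximum Q-value over the actions returned by qValues(s)
double max = Double.NEGATIVE_INFINITY;
for(QValue q : this.qValues(s)){
    max = Math.max(max, q.q);
}
return max;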
Example #15
Source File: IRLExample.java From burlap_examples with MIT License
/**
 * Runs MLIRL on the trajectories stored in the "irlDemo" directory and then visualizes the learned reward function.
 */
public void runIRL(String pathToEpisodes){

    //create reward function features to use
    LocationFeatures features = new LocationFeatures(this.domain, 5);

    //create a reward function that is linear with respect to those features and has small random
    //parameter values to start
    LinearStateDifferentiableRF rf = new LinearStateDifferentiableRF(features, 5);
    for(int i = 0; i < rf.numParameters(); i++){
        rf.setParameter(i, RandomFactory.getMapped(0).nextDouble()*0.2 - 0.1);
    }

    //load our saved demonstrations from disk
    List<Episode> episodes = Episode.readEpisodes(pathToEpisodes);

    //use either DifferentiableVI or DifferentiableSparseSampling for planning. The latter enables receding horizon IRL,
    //but you will probably want to use a fairly large horizon for this kind of reward function.
    double beta = 10;
    //DifferentiableVI dplanner = new DifferentiableVI(this.domain, rf, 0.99, beta, new SimpleHashableStateFactory(), 0.01, 100);
    DifferentiableSparseSampling dplanner = new DifferentiableSparseSampling(this.domain, rf, 0.99, new SimpleHashableStateFactory(), 10, -1, beta);

    dplanner.toggleDebugPrinting(false);

    //define the IRL problem
    MLIRLRequest request = new MLIRLRequest(domain, dplanner, episodes, rf);
    request.setBoltzmannBeta(beta);

    //run MLIRL on it
    MLIRL irl = new MLIRL(request, 0.1, 0.1, 10);
    irl.performIRL();

    //get all states in the domain so we can visualize the learned reward function for them
    List<State> allStates = StateReachability.getReachableStates(basicState(), this.domain, new SimpleHashableStateFactory());

    //get a standard grid world value function visualizer, but give it StateRewardFunctionValue which returns the
    //reward value received upon reaching each state, which will thereby let us render the reward function that is
    //learned rather than the value function for it.
    ValueFunctionVisualizerGUI gui = GridWorldDomain.getGridWorldValueFunctionVisualization(
            allStates, 5, 5, new RewardValueProjection(rf),
            new GreedyQPolicy((QProvider) request.getPlanner())
    );

    gui.initGUI();
}
Example #16
Source File: BoltzmannQPolicy.java From burlap with Apache License 2.0
/**
 * Initializes with a temperature value and the QComputable valueFunction to use. The temperature value controls how greedy the Boltzmann distribution is.
 * The temperature should be positive, with values near zero causing the distribution to be more greedy. A high temperature
 * causes the distribution to be more uniform.
 * @param planner the q-computable valueFunction to use.
 * @param temperature the positive temperature value to use
 */
public BoltzmannQPolicy(QProvider planner, double temperature){
    this.qplanner = planner;
    this.temperature = temperature;
}
Example #17
Source File: GreedyDeterministicQPolicy.java From burlap with Apache License 2.0
/**
 * Initializes with a QComputablePlanner
 * @param qplanner the QComputablePlanner to use
 */
public GreedyDeterministicQPolicy(QProvider qplanner){
    this.qplanner = qplanner;
}