burlap.behavior.valuefunction.QProvider Java Examples
The following examples show how to use
burlap.behavior.valuefunction.QProvider.
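Before the examples, here is a minimal sketch (not taken from any of the projects below) of how a QProvider is typically queried, using only calls that appear in the examples on this page. The class and variable names are placeholders, and the import paths assume the BURLAP 3 package layout.

import java.util.List;

import burlap.behavior.valuefunction.QProvider;
import burlap.behavior.valuefunction.QValue;
import burlap.mdp.core.state.State;

public class QProviderSketch {

    //prints each action's estimated Q-value and the greedy state value for a state
    public static void printQs(QProvider provider, State s){
        List<QValue> qs = provider.qValues(s); //one QValue per applicable action
        for(QValue q : qs){
            System.out.println(q.a + " -> " + q.q);
        }
        //max Q-value over actions, as used in Examples #1 and #14 below
        System.out.println("V(s) = " + QProvider.Helper.maxQ(provider, s));
    }
}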
Example #1
Source File: VITutorial.java From burlap_examples with MIT License
@Override
public GreedyQPolicy planFromState(State initialState) {

    HashableState hashedInitialState = this.hashingFactory.hashState(initialState);
    if(this.valueFunction.containsKey(hashedInitialState)){
        return new GreedyQPolicy(this); //already performed planning here!
    }

    //if the state is new, then find all reachable states from it first
    this.performReachabilityFrom(initialState);

    //now perform multiple iterations over the whole state space
    for(int i = 0; i < this.numIterations; i++){
        //iterate over each state
        for(HashableState sh : this.valueFunction.keySet()){
            //update its value using the bellman equation
            this.valueFunction.put(sh, QProvider.Helper.maxQ(this, sh.s()));
        }
    }

    return new GreedyQPolicy(this);
}
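A hedged usage sketch for the planner above: the VITutorial constructor arguments shown here are assumptions based on the tutorial this file comes from, so treat them as illustrative rather than authoritative; domain and initialState are placeholders.

//construct the tutorial planner and plan from a start state (constructor signature assumed)
VITutorial vi = new VITutorial(domain, 0.99, new SimpleHashableStateFactory(),
        new ConstantValueFunction(0.), 30);
GreedyQPolicy p = vi.planFromState(initialState); //runs the value iteration sweeps shown above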
Example #2
Source File: AnnealedEpsilonGreedy.java From burlap_caffe with Apache License 2.0
public AnnealedEpsilonGreedy(QProvider planner, double epsilonStart, double epsilonEnd, int annealingTime) {
    super(planner, epsilonStart);

    this.epsilonStart = epsilonStart;
    this.epsilonEnd = epsilonEnd;
    this.epsilonStep = (epsilonEnd - epsilonStart)/annealingTime;
    this.annealingTime = annealingTime;
}
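A hedged usage sketch: because QLearning implements QProvider (it is cast to one in Example #13), a learning agent can be paired with this annealed policy. The QLearning constructor mirrors Example #13; the epsilon schedule values and the setLearningPolicy call are assumptions, with domain, hashingFactory as placeholders.

//anneal epsilon from 1.0 down to 0.1 over 100000 selections (illustrative values only)
QLearning agent = new QLearning(domain, 0.99, hashingFactory, 0., 1.);
AnnealedEpsilonGreedy exploration = new AnnealedEpsilonGreedy(agent, 1.0, 0.1, 100000);
agent.setLearningPolicy(exploration); //assumed setter on QLearning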
Example #3
Source File: BoltzmannPolicyGradient.java From burlap with Apache License 2.0
/**
 * Computes the gradient of a Boltzmann policy using the given differentiable valueFunction.
 * @param s the input state of the policy gradient
 * @param a the action whose policy probability gradient is being queried
 * @param planner the differentiable {@link DifferentiableQFunction} valueFunction
 * @param beta the Boltzmann beta parameter. This parameter is the inverse of the Boltzmann temperature. As beta becomes larger, the policy becomes more deterministic. Should lie in [0, +infinity].
 * @return the gradient of the policy.
 */
public static FunctionGradient computeBoltzmannPolicyGradient(State s, Action a, DifferentiableQFunction planner, double beta){

    //get q objects
    List<QValue> Qs = ((QProvider)planner).qValues(s);
    double [] qs = new double[Qs.size()];
    for(int i = 0; i < Qs.size(); i++){
        qs[i] = Qs.get(i).q;
    }

    //find matching action index
    int aind = -1;
    for(int i = 0; i < Qs.size(); i++){
        if(Qs.get(i).a.equals(a)){
            aind = i;
            break;
        }
    }

    if(aind == -1){
        throw new RuntimeException("Error in computing BoltzmannPolicyGradient: Could not find query action in Q-value list.");
    }

    FunctionGradient [] qGradients = new FunctionGradient[qs.length];
    for(int i = 0; i < qs.length; i++){
        qGradients[i] = planner.qGradient(s, Qs.get(i).a);
    }

    FunctionGradient policyGradient = computePolicyGradient(qs, qGradients, aind, beta);

    return policyGradient;
}
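For reference (a standard definition, not quoted from the BURLAP source), the Boltzmann policy whose gradient this method computes has the form

    \pi_\beta(a \mid s) = \exp(\beta Q(s,a)) / \sum_{a'} \exp(\beta Q(s,a'))

which is why the method first collects every Q-value and its gradient before combining them in computePolicyGradient via the chain rule.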
Example #4
Source File: MLIRL.java From burlap with Apache License 2.0
/**
 * Computes and returns the gradient of the Boltzmann policy for the given state and action.
 * @param s the state in which the policy is queried
 * @param ga the action for which the policy is queried.
 * @return the gradient of the Boltzmann policy for the given state and action.
 */
public FunctionGradient logPolicyGrad(State s, Action ga){

    Policy p = new BoltzmannQPolicy((QProvider)this.request.getPlanner(), 1./this.request.getBoltzmannBeta());
    double invActProb = 1./p.actionProb(s, ga);
    FunctionGradient gradient = BoltzmannPolicyGradient.computeBoltzmannPolicyGradient(s, ga, (DifferentiableQFunction)this.request.getPlanner(), this.request.getBoltzmannBeta());

    for(FunctionGradient.PartialDerivative pd : gradient.getNonZeroPartialDerivatives()){
        double newVal = pd.value * invActProb;
        gradient.put(pd.parameterId, newVal);
    }

    return gradient;
}
Example #5
Source File: MLIRL.java From burlap with Apache License 2.0
/**
 * Computes and returns the log-likelihood of the given trajectory under the current reward function parameters, weighted by the given weight.
 * @param ea the trajectory
 * @param weight the weight to assign the trajectory
 * @return the weighted log-likelihood of the given trajectory under the current reward function parameters.
 */
public double logLikelihoodOfTrajectory(Episode ea, double weight){
    double logLike = 0.;
    Policy p = new BoltzmannQPolicy((QProvider)this.request.getPlanner(), 1./this.request.getBoltzmannBeta());
    for(int i = 0; i < ea.numTimeSteps()-1; i++){
        this.request.getPlanner().planFromState(ea.state(i));
        double actProb = p.actionProb(ea.state(i), ea.action(i));
        logLike += Math.log(actProb);
    }
    logLike *= weight;
    return logLike;
}
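In other words (a restatement of the loop above, not additional source code), the returned quantity is

    weight \cdot \sum_{t=0}^{T-1} \log \pi(a_t \mid s_t)

where \pi is the Boltzmann Q-policy with temperature 1/\beta and the sum runs over the trajectory's time steps.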
Example #6
Source File: GreedyQPolicy.java From burlap with Apache License 2.0
@Override
public void setSolver(MDPSolverInterface solver){

    if(!(solver instanceof QProvider)){
        throw new RuntimeErrorException(new Error("Planner is not a QComputablePlanner"));
    }

    this.qplanner = (QProvider) solver;
}
Example #7
Source File: EpsilonGreedy.java From burlap with Apache License 2.0
@Override
public void setSolver(MDPSolverInterface solver){

    if(!(solver instanceof QProvider)){
        throw new RuntimeErrorException(new Error("Planner is not a QComputablePlanner"));
    }

    this.qplanner = (QProvider) solver;
}
Example #8
Source File: GreedyDeterministicQPolicy.java From burlap with Apache License 2.0
@Override
public void setSolver(MDPSolverInterface solver){

    if(!(solver instanceof QProvider)){
        throw new RuntimeErrorException(new Error("Planner is not a QComputablePlanner"));
    }

    this.qplanner = (QProvider) solver;
}
Example #9
Source File: BoltzmannQPolicy.java From burlap with Apache License 2.0
@Override
public void setSolver(MDPSolverInterface solver) {

    if(!(solver instanceof QProvider)){
        throw new RuntimeErrorException(new Error("Planner is not a QComputablePlanner"));
    }

    this.qplanner = (QProvider) solver;
}
Example #10
Source File: QMDP.java From burlap with Apache License 2.0
/**
 * Initializes.
 * @param domain the POMDP domain
 * @param mdpQSource the underlying fully observable MDP {@link QProvider} source.
 */
public QMDP(PODomain domain, QProvider mdpQSource){
    this.mdpQSource = mdpQSource;
    Planner planner = (Planner)this.mdpQSource;
    this.solverInit(domain, planner.getGamma(), planner.getHashingFactory());
}
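A hedged construction sketch: any QProvider-implementing planner can serve as the mdpQSource. Here ValueIteration is used with its commonly documented constructor, which should be treated as an assumption, as should passing the PODomain directly as the fully observable model; poDomain and hashingFactory are placeholders.

//plan over the underlying fully observable MDP and hand the result to QMDP (signatures assumed)
ValueIteration vi = new ValueIteration(poDomain, 0.99, hashingFactory, 0.01, 100);
QMDP qmdp = new QMDP(poDomain, vi);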
Example #11
Source File: EpsilonGreedy.java From burlap with Apache License 2.0
/**
 * Initializes with the QComputablePlanner to use and the value of epsilon, where epsilon is the probability of taking a random action.
 * @param planner the QComputablePlanner to use
 * @param epsilon the probability of taking a random action.
 */
public EpsilonGreedy(QProvider planner, double epsilon) {
    qplanner = planner;
    this.epsilon = epsilon;
    rand = RandomFactory.getMapped(0);
}
Example #12
Source File: GreedyQPolicy.java From burlap with Apache License 2.0
/**
 * Initializes with a QComputablePlanner
 * @param planner the QComputablePlanner to use
 */
public GreedyQPolicy(QProvider planner){
    qplanner = planner;
    rand = RandomFactory.getMapped(0);
}
Example #13
Source File: BasicBehavior.java From burlap_examples with MIT License
public void qLearningExample(String outputPath){

    LearningAgent agent = new QLearning(domain, 0.99, hashingFactory, 0., 1.);

    //run learning for 50 episodes
    for(int i = 0; i < 50; i++){
        Episode e = agent.runLearningEpisode(env);

        e.write(outputPath + "ql_" + i);
        System.out.println(i + ": " + e.maxTimeStep());

        //reset environment for next learning episode
        env.resetEnvironment();
    }

    simpleValueFunctionVis((ValueFunction)agent, new GreedyQPolicy((QProvider) agent));
}
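As a possible follow-up to the example above (not part of the original file), the learned greedy policy could also be rolled out in the environment; PolicyUtils.rollout is assumed to be available in this BURLAP version.

//evaluate the learned policy greedily for one more episode
Policy greedy = new GreedyQPolicy((QProvider) agent);
env.resetEnvironment();
Episode e = PolicyUtils.rollout(greedy, env);
System.out.println("greedy episode steps: " + e.maxTimeStep());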
Example #14
Source File: QLTutorial.java From burlap_examples with MIT License
@Override
public double value(State s) {
    return QProvider.Helper.maxQ(this, s);
}
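For context, QProvider.Helper.maxQ returns the largest Q-value among the state's applicable actions; a rough sketch of that computation (not the BURLAP source itself) is:

//maximum Q-value over the actions returned by qValues(s)
double max = Double.NEGATIVE_INFINITY;
for(QValue q : this.qValues(s)){
    max = Math.max(max, q.q);
}
return max;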
Example #15
Source File: IRLExample.java From burlap_examples with MIT License
/**
 * Runs MLIRL on the trajectories stored in the "irlDemo" directory and then visualizes the learned reward function.
 */
public void runIRL(String pathToEpisodes){

    //create reward function features to use
    LocationFeatures features = new LocationFeatures(this.domain, 5);

    //create a reward function that is linear with respect to those features and has small random
    //parameter values to start
    LinearStateDifferentiableRF rf = new LinearStateDifferentiableRF(features, 5);
    for(int i = 0; i < rf.numParameters(); i++){
        rf.setParameter(i, RandomFactory.getMapped(0).nextDouble()*0.2 - 0.1);
    }

    //load our saved demonstrations from disk
    List<Episode> episodes = Episode.readEpisodes(pathToEpisodes);

    //use either DifferentiableVI or DifferentiableSparseSampling for planning. The latter enables receding horizon IRL,
    //but you will probably want to use a fairly large horizon for this kind of reward function.
    double beta = 10;
    //DifferentiableVI dplanner = new DifferentiableVI(this.domain, rf, 0.99, beta, new SimpleHashableStateFactory(), 0.01, 100);
    DifferentiableSparseSampling dplanner = new DifferentiableSparseSampling(this.domain, rf, 0.99, new SimpleHashableStateFactory(), 10, -1, beta);

    dplanner.toggleDebugPrinting(false);

    //define the IRL problem
    MLIRLRequest request = new MLIRLRequest(domain, dplanner, episodes, rf);
    request.setBoltzmannBeta(beta);

    //run MLIRL on it
    MLIRL irl = new MLIRL(request, 0.1, 0.1, 10);
    irl.performIRL();

    //get all states in the domain so we can visualize the learned reward function for them
    List<State> allStates = StateReachability.getReachableStates(basicState(), this.domain, new SimpleHashableStateFactory());

    //get a standard grid world value function visualizer, but give it StateRewardFunctionValue which returns the
    //reward value received upon reaching each state, which will thereby let us render the reward function that is
    //learned rather than the value function for it.
    ValueFunctionVisualizerGUI gui = GridWorldDomain.getGridWorldValueFunctionVisualization(
            allStates, 5, 5, new RewardValueProjection(rf),
            new GreedyQPolicy((QProvider) request.getPlanner())
    );

    gui.initGUI();
}
Example #16
Source File: BoltzmannQPolicy.java From burlap with Apache License 2.0
/**
 * Initializes with a temperature value and the QComputable valueFunction to use. The temperature value controls how greedy the Boltzmann distribution is.
 * The temperature should be positive, with values near zero causing the distribution to be more greedy. A high temperature
 * causes the distribution to be more uniform.
 * @param planner the q-computable valueFunction to use.
 * @param temperature the positive temperature value to use
 */
public BoltzmannQPolicy(QProvider planner, double temperature){
    this.qplanner = planner;
    this.temperature = temperature;
}
Example #17
Source File: GreedyDeterministicQPolicy.java From burlap with Apache License 2.0
/**
 * Initializes with a QComputablePlanner
 * @param qplanner the QComputablePlanner to use
 */
public GreedyDeterministicQPolicy(QProvider qplanner){
    this.qplanner = qplanner;
}