burlap.behavior.policy.GreedyQPolicy Java Examples
The following examples show how to use burlap.behavior.policy.GreedyQPolicy. Each example lists its source file, project, and license.
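Most of the examples below follow the same pattern: a planner or learning algorithm that exposes Q-values (a QProvider) is constructed, and a GreedyQPolicy is wrapped around it so that action selection always picks the highest-Q action, breaking ties uniformly at random. The sketch below shows that pattern in isolation. It is only a sketch: the method name greedyQExample is hypothetical, the import paths assume BURLAP 3's package layout, and the discount factor, VI termination threshold, iteration cap, and rollout length are placeholder values rather than recommendations.

import burlap.behavior.policy.GreedyQPolicy;
import burlap.behavior.policy.PolicyUtils;
import burlap.behavior.singleagent.Episode;
import burlap.behavior.singleagent.planning.stochastic.valueiteration.ValueIteration;
import burlap.mdp.core.state.State;
import burlap.mdp.singleagent.SADomain;
import burlap.statehashing.simple.SimpleHashableStateFactory;

//a minimal sketch: plan with value iteration, then act greedily with respect to its Q-values
public static void greedyQExample(SADomain domain, State initialState){
    ValueIteration planner = new ValueIteration(domain, 0.99, new SimpleHashableStateFactory(), 0.001, 100);
    GreedyQPolicy p = planner.planFromState(initialState); //planFromState returns a GreedyQPolicy over the planner's Q-values
    Episode e = PolicyUtils.rollout(p, initialState, domain.getModel(), 500); //roll out greedily for at most 500 steps
    System.out.println("Steps: " + e.maxTimeStep());
}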
Example #1
Source File: VITutorial.java From burlap_examples with MIT License

@Override
public GreedyQPolicy planFromState(State initialState) {

    HashableState hashedInitialState = this.hashingFactory.hashState(initialState);
    if(this.valueFunction.containsKey(hashedInitialState)){
        return new GreedyQPolicy(this); //already performed planning here!
    }

    //if the state is new, then find all reachable states from it first
    this.performReachabilityFrom(initialState);

    //now perform multiple iterations over the whole state space
    for(int i = 0; i < this.numIterations; i++){
        //iterate over each state
        for(HashableState sh : this.valueFunction.keySet()){
            //update its value using the bellman equation
            this.valueFunction.put(sh, QProvider.Helper.maxQ(this, sh.s()));
        }
    }

    return new GreedyQPolicy(this);
}
Example #2
Source File: ContinuousDomainTutorial.java From burlap_examples with MIT License

public static void IPSS(){

    InvertedPendulum ip = new InvertedPendulum();
    ip.physParams.actionNoise = 0.;
    RewardFunction rf = new InvertedPendulum.InvertedPendulumRewardFunction(Math.PI/8.);
    TerminalFunction tf = new InvertedPendulum.InvertedPendulumTerminalFunction(Math.PI/8.);
    ip.setRf(rf);
    ip.setTf(tf);
    SADomain domain = ip.generateDomain();

    State initialState = new InvertedPendulumState();

    SparseSampling ss = new SparseSampling(domain, 1, new SimpleHashableStateFactory(), 10, 1);
    ss.setForgetPreviousPlanResults(true);
    ss.toggleDebugPrinting(false);
    Policy p = new GreedyQPolicy(ss);

    Episode e = PolicyUtils.rollout(p, initialState, domain.getModel(), 500);
    System.out.println("Num steps: " + e.maxTimeStep());

    Visualizer v = CartPoleVisualizer.getCartPoleVisualizer();
    new EpisodeSequenceVisualizer(v, domain, Arrays.asList(e));
}
Example #3
Source File: QLearning.java From burlap with Apache License 2.0

/**
 * Plans from the input state and then returns a {@link burlap.behavior.policy.GreedyQPolicy} that greedily
 * selects the action with the highest Q-value and breaks ties uniformly randomly.
 * @param initialState the initial state of the planning problem
 * @return a {@link burlap.behavior.policy.GreedyQPolicy}.
 */
@Override
public GreedyQPolicy planFromState(State initialState) {

    if(this.model == null){
        throw new RuntimeException("QLearning (and its subclasses) cannot execute planFromState because a model is not specified.");
    }

    SimulatedEnvironment env = new SimulatedEnvironment(this.domain, initialState);

    int eCount = 0;
    do{
        this.runLearningEpisode(env, this.maxEpisodeSize);
        eCount++;
    }while(eCount < numEpisodesForPlanning && maxQChangeInLastEpisode > maxQChangeForPlanningTermination);

    return new GreedyQPolicy(this);
}
Example #4
Source File: GradientDescentSarsaLam.java From burlap with Apache License 2.0

/**
 * Plans from the input state and then returns a {@link burlap.behavior.policy.GreedyQPolicy} that greedily
 * selects the action with the highest Q-value and breaks ties uniformly randomly.
 * @param initialState the initial state of the planning problem
 * @return a {@link burlap.behavior.policy.GreedyQPolicy}.
 */
@Override
public GreedyQPolicy planFromState(State initialState) {

    if(this.model == null){
        throw new RuntimeException("Planning requires a model, but none is provided.");
    }

    SimulatedEnvironment env = new SimulatedEnvironment(domain, initialState);

    int eCount = 0;
    do{
        this.runLearningEpisode(env);
        eCount++;
    }while(eCount < numEpisodesForPlanning && maxWeightChangeInLastEpisode > maxWeightChangeForPlanningTermination);

    return new GreedyQPolicy(this);
}
Example #5
Source File: PolicyIteration.java From burlap with Apache License 2.0

/**
 * Plans from the input state and then returns a {@link burlap.behavior.policy.GreedyQPolicy} that greedily
 * selects the action with the highest Q-value and breaks ties uniformly randomly.
 * @param initialState the initial state of the planning problem
 * @return a {@link burlap.behavior.policy.GreedyQPolicy}.
 */
@Override
public GreedyQPolicy planFromState(State initialState) {

    int iterations = 0;
    if(this.performReachabilityFrom(initialState) || !this.hasRunPlanning){

        double delta;
        do{
            delta = this.evaluatePolicy();
            iterations++;
            this.evaluativePolicy = new GreedyQPolicy(this.getCopyOfValueFunction());
        }while(delta > this.maxPIDelta && iterations < maxPolicyIterations);

        this.hasRunPlanning = true;
    }

    DPrint.cl(this.debugCode, "Total policy iterations: " + iterations);
    this.totalPolicyIterations += iterations;

    return (GreedyQPolicy)this.evaluativePolicy;
}
Example #6
Source File: LSPI.java From burlap with Apache License 2.0

/**
 * Runs LSPI for either numIterations or until the change in the weight matrix is no greater than maxChange.
 * @param numIterations the maximum number of policy iterations.
 * @param maxChange when the weight change is smaller than this value, LSPI terminates.
 * @return a {@link burlap.behavior.policy.GreedyQPolicy} using this object as the {@link QProvider} source.
 */
public GreedyQPolicy runPolicyIteration(int numIterations, double maxChange){

    boolean converged = false;
    for(int i = 0; i < numIterations && !converged; i++){
        SimpleMatrix nw = this.LSTDQ();
        double change = Double.POSITIVE_INFINITY;
        if(this.lastWeights != null){
            change = this.lastWeights.minus(nw).normF();
            if(change <= maxChange){
                converged = true;
            }
        }
        this.lastWeights = nw;

        DPrint.cl(0, "Finished iteration: " + i + ". Weight change: " + change);
    }
    DPrint.cl(0, "Finished Policy Iteration.");
    return new GreedyQPolicy(this);
}
Example #7
Source File: LSPI.java From burlap with Apache License 2.0

/**
 * Plans from the input state and then returns a {@link burlap.behavior.policy.GreedyQPolicy} that greedily
 * selects the action with the highest Q-value and breaks ties uniformly randomly.
 * @param initialState the initial state of the planning problem
 * @return a {@link burlap.behavior.policy.GreedyQPolicy}.
 */
@Override
public GreedyQPolicy planFromState(State initialState) {

    if(this.model == null){
        throw new RuntimeException("LSPI cannot execute planFromState because the reward function and/or terminal function for planning have not been set. Use the initializeForPlanning method to set them.");
    }

    if(planningCollector == null){
        this.planningCollector = new SARSCollector.UniformRandomSARSCollector(this.actionTypes);
    }
    this.dataset = this.planningCollector.collectNInstances(new ConstantStateGenerator(initialState), this.model, this.numSamplesForPlanning, Integer.MAX_VALUE, this.dataset);
    return this.runPolicyIteration(this.maxNumPlanningIterations, this.maxChange);
}
Example #8
Source File: PolicyIteration.java From burlap with Apache License 2.0

/**
 * Initializes the valueFunction.
 * @param domain the domain in which to plan
 * @param gamma the discount factor
 * @param hashingFactory the state hashing factory to use
 * @param maxPIDelta when the maximum value function change between policy iterations is smaller than this value, planning will terminate.
 * @param maxEvalDelta when the maximum change in the value function is smaller than this value, policy evaluation will terminate.
 * @param maxEvaluationIterations when the number of value iteration sweeps used to evaluate a policy exceeds this value, policy evaluation will terminate.
 * @param maxPolicyIterations when the number of policy iterations passes this value, planning will terminate.
 */
public PolicyIteration(SADomain domain, double gamma, HashableStateFactory hashingFactory, double maxPIDelta, double maxEvalDelta, int maxEvaluationIterations, int maxPolicyIterations){
    this.DPPInit(domain, gamma, hashingFactory);

    this.maxEvalDelta = maxEvalDelta;
    this.maxPIDelta = maxPIDelta;
    this.maxIterations = maxEvaluationIterations;
    this.maxPolicyIterations = maxPolicyIterations;

    this.evaluativePolicy = new GreedyQPolicy(this.getCopyOfValueFunction());
}
Example #9
Source File: RTDP.java From burlap with Apache License 2.0

/**
 * Plans from the input state and then returns a {@link burlap.behavior.policy.GreedyQPolicy} that greedily
 * selects the action with the highest Q-value and breaks ties uniformly randomly.
 * @param initialState the initial state of the planning problem
 * @return a {@link burlap.behavior.policy.GreedyQPolicy}.
 */
@Override
public GreedyQPolicy planFromState(State initialState) {
    if(!useBatch){
        this.normalRTDP(initialState);
    }
    else{
        this.batchRTDP(initialState);
    }

    return new GreedyQPolicy(this);
}
Example #10
Source File: ValueIteration.java From burlap with Apache License 2.0

/**
 * Plans from the input state and then returns a {@link burlap.behavior.policy.GreedyQPolicy} that greedily
 * selects the action with the highest Q-value and breaks ties uniformly randomly.
 * @param initialState the initial state of the planning problem
 * @return a {@link burlap.behavior.policy.GreedyQPolicy}.
 */
@Override
public GreedyQPolicy planFromState(State initialState){
    if(this.performReachabilityFrom(initialState) || !this.hasRunVI){
        this.runVI();
    }

    return new GreedyQPolicy(this);
}
Example #11
Source File: PolicyIteration.java From burlap with Apache License 2.0

/**
 * Initializes the valueFunction.
 * @param domain the domain in which to plan
 * @param gamma the discount factor
 * @param hashingFactory the state hashing factory to use
 * @param maxDelta when the maximum change in the value function is smaller than this value, policy evaluation will terminate. Similarly, when the maximum value function change between policy iterations is smaller than this value, planning will terminate.
 * @param maxEvaluationIterations when the number of value iteration sweeps used to evaluate a policy exceeds this value, policy evaluation will terminate.
 * @param maxPolicyIterations when the number of policy iterations passes this value, planning will terminate.
 */
public PolicyIteration(SADomain domain, double gamma, HashableStateFactory hashingFactory, double maxDelta, int maxEvaluationIterations, int maxPolicyIterations){
    this.DPPInit(domain, gamma, hashingFactory);

    this.maxEvalDelta = maxDelta;
    this.maxPIDelta = maxDelta;
    this.maxIterations = maxEvaluationIterations;
    this.maxPolicyIterations = maxPolicyIterations;

    this.evaluativePolicy = new GreedyQPolicy(this.getCopyOfValueFunction());
}
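For reference, the sketch below shows how these constructors are typically combined with planFromState from Example #5. It is only an illustration: domain, hashingFactory, and initialState are assumed to already exist, and the gamma, delta, and iteration values are placeholders.

//illustrative parameters: gamma 0.99, delta threshold 0.001, at most 100 evaluation sweeps and 100 policy improvement steps
PolicyIteration pi = new PolicyIteration(domain, 0.99, hashingFactory, 0.001, 100, 100);
GreedyQPolicy p = pi.planFromState(initialState); //plans over the states reachable from initialState, then acts greedily with random tie breaks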
Example #12
Source File: SparseSampling.java From burlap with Apache License 2.0

/**
 * Plans from the input state and then returns a {@link burlap.behavior.policy.GreedyQPolicy} that greedily
 * selects the action with the highest Q-value and breaks ties uniformly randomly.
 * @param initialState the initial state of the planning problem
 * @return a {@link burlap.behavior.policy.GreedyQPolicy}.
 */
@Override
public GreedyQPolicy planFromState(State initialState) {

    if(this.forgetPreviousPlanResults){
        this.rootLevelQValues.clear();
    }

    HashableState sh = this.hashingFactory.hashState(initialState);
    if(this.rootLevelQValues.containsKey(sh)){
        return new GreedyQPolicy(this); //already planned for this state
    }

    DPrint.cl(this.debugCode, "Beginning Planning.");
    int oldUpdates = this.numUpdates;

    StateNode sn = this.getStateNode(initialState, this.h);
    rootLevelQValues.put(sh, sn.estimateQs());

    DPrint.cl(this.debugCode, "Finished Planning with " + (this.numUpdates - oldUpdates) + " value estimates; for a cumulative total of: " + this.numUpdates);

    if(this.forgetPreviousPlanResults){
        this.nodesByHeight.clear();
    }

    return new GreedyQPolicy(this);
}
Example #13
Source File: IRLExample.java From burlap_examples with MIT License

/**
 * Runs MLIRL on the trajectories stored at the supplied path (e.g., the "irlDemo" directory) and then visualizes the learned reward function.
 */
public void runIRL(String pathToEpisodes){

    //create reward function features to use
    LocationFeatures features = new LocationFeatures(this.domain, 5);

    //create a reward function that is linear with respect to those features and has small random
    //parameter values to start
    LinearStateDifferentiableRF rf = new LinearStateDifferentiableRF(features, 5);
    for(int i = 0; i < rf.numParameters(); i++){
        rf.setParameter(i, RandomFactory.getMapped(0).nextDouble()*0.2 - 0.1);
    }

    //load our saved demonstrations from disk
    List<Episode> episodes = Episode.readEpisodes(pathToEpisodes);

    //use either DifferentiableVI or DifferentiableSparseSampling for planning. The latter enables receding horizon IRL,
    //but you will probably want to use a fairly large horizon for this kind of reward function.
    double beta = 10;
    //DifferentiableVI dplanner = new DifferentiableVI(this.domain, rf, 0.99, beta, new SimpleHashableStateFactory(), 0.01, 100);
    DifferentiableSparseSampling dplanner = new DifferentiableSparseSampling(this.domain, rf, 0.99, new SimpleHashableStateFactory(), 10, -1, beta);

    dplanner.toggleDebugPrinting(false);

    //define the IRL problem
    MLIRLRequest request = new MLIRLRequest(domain, dplanner, episodes, rf);
    request.setBoltzmannBeta(beta);

    //run MLIRL on it
    MLIRL irl = new MLIRL(request, 0.1, 0.1, 10);
    irl.performIRL();

    //get all states in the domain so we can visualize the learned reward function for them
    List<State> allStates = StateReachability.getReachableStates(basicState(), this.domain, new SimpleHashableStateFactory());

    //get a standard grid world value function visualizer, but give it StateRewardFunctionValue which returns the
    //reward value received upon reaching each state, thereby letting us render the learned reward function
    //rather than the value function for it.
    ValueFunctionVisualizerGUI gui = GridWorldDomain.getGridWorldValueFunctionVisualization(
            allStates, 5, 5, new RewardValueProjection(rf), new GreedyQPolicy((QProvider) request.getPlanner())
    );

    gui.initGUI();
}
Example #14
Source File: BoundedRTDP.java From burlap with Apache License 2.0

/**
 * Plans from the input state and then returns a {@link burlap.behavior.policy.GreedyQPolicy} that greedily
 * selects the action with the highest Q-value and breaks ties uniformly randomly.
 * @param initialState the initial state of the planning problem
 * @return a {@link burlap.behavior.policy.GreedyQPolicy}.
 */
@Override
public GreedyQPolicy planFromState(State initialState) {

    DPrint.cl(this.debugCode, "Beginning Planning.");
    int nr = 0;
    while(this.runRollout(initialState) > this.maxDiff && (nr < this.maxRollouts || this.maxRollouts == -1)){
        nr++;
    }

    DPrint.cl(this.debugCode, "Finished planning with a total of " + this.numBellmanUpdates + " backups.");

    return new GreedyQPolicy(this);
}
Example #15
Source File: BasicBehavior.java From burlap_examples with MIT License

public void qLearningExample(String outputPath){

    LearningAgent agent = new QLearning(domain, 0.99, hashingFactory, 0., 1.);

    //run learning for 50 episodes
    for(int i = 0; i < 50; i++){
        Episode e = agent.runLearningEpisode(env);

        e.write(outputPath + "ql_" + i);
        System.out.println(i + ": " + e.maxTimeStep());

        //reset environment for next learning episode
        env.resetEnvironment();
    }

    simpleValueFunctionVis((ValueFunction)agent, new GreedyQPolicy((QProvider) agent));
}
Example #16
Source File: BeliefSparseSampling.java From burlap with Apache License 2.0

@Override
public Policy planFromState(State initialState){
    this.mdpPlanner.planFromState(initialState);
    return new GreedyQPolicy(this);
}
Example #17
Source File: QMDP.java From burlap with Apache License 2.0

@Override
public Policy planFromState(State initialState) {
    this.forceMDPPlanningFromAllStates();
    return new GreedyQPolicy(this);
}
Example #18
Source File: VIModelLearningPlanner.java From burlap with Apache License 2.0

/**
 * Initializes
 * @param domain model domain
 * @param model the learned model to use for planning
 * @param gamma discount factor
 * @param hashingFactory the hashing factory
 * @param maxDelta max value function delta in VI
 * @param maxIterations max iterations of VI
 */
public VIModelLearningPlanner(SADomain domain, FullModel model, double gamma, HashableStateFactory hashingFactory, double maxDelta, int maxIterations){
    super(domain, gamma, hashingFactory, maxDelta, maxIterations);
    this.setModel(model);
    this.modelPolicy = new ReplanIfUnseenPolicy(new GreedyQPolicy(this));
    this.toggleDebugPrinting(false);
}
Example #19
Source File: BeliefSparseSampling.java From burlap with Apache License 2.0

public static void main(String [] args){

    TigerDomain tiger = new TigerDomain(true);
    PODomain domain = (PODomain)tiger.generateDomain();
    BeliefState initialBelief = TigerDomain.getInitialBeliefState(domain);

    BeliefSparseSampling bss = new BeliefSparseSampling(domain, 0.99, new ReflectiveHashableStateFactory(), 10, -1);
    Policy p = new GreedyQPolicy(bss);

    SimulatedPOEnvironment env = new SimulatedPOEnvironment(domain);
    env.setCurStateTo(new TigerState(TigerDomain.VAL_LEFT));

    BeliefPolicyAgent agent = new BeliefPolicyAgent(domain, env, p);
    agent.setBeliefState(initialBelief);
    agent.setEnvironment(env);

    /*
    State initialBeliefStateOb = BeliefMDPGenerator.getBeliefMDPState(bss.getBeliefMDP(), initialBelief);
    List<QValue> qs = bss.getQs(initialBeliefStateOb);
    for(QValue q : qs){
        System.out.println(q.a.toString() + ": " + q.q);
    }
    */

    Episode ea = agent.actUntilTerminalOrMaxSteps(30);

    for(int i = 0; i < ea.numTimeSteps()-1; i++){
        System.out.println(ea.action(i) + " " + ea.reward(i+1));
    }
}
Example #20
Source File: LSPI.java From burlap with Apache License 2.0

/**
 * Runs LSTDQ on this object's current {@link SARSData} dataset.
 * @return the new weight matrix as a {@link SimpleMatrix} object.
 */
public SimpleMatrix LSTDQ(){

    //set our policy
    Policy p = new GreedyQPolicy(this);

    //first we want to get all the features for all of our states in our data set; this is important if our feature database generates new features on the fly
    List<SSFeatures> features = new ArrayList<LSPI.SSFeatures>(this.dataset.size());
    int nf = 0;
    for(SARS sars : this.dataset.dataset){
        SSFeatures transitionFeatures = new SSFeatures(this.saFeatures.features(sars.s, sars.a), this.saFeatures.features(sars.sp, p.action(sars.sp)));
        features.add(transitionFeatures);
        nf = Math.max(nf, transitionFeatures.sActionFeatures.length);
    }

    SimpleMatrix B = SimpleMatrix.identity(nf).scale(this.identityScalar);
    SimpleMatrix b = new SimpleMatrix(nf, 1);

    for(int i = 0; i < features.size(); i++){

        SimpleMatrix phi = this.phiConstructor(features.get(i).sActionFeatures, nf);
        SimpleMatrix phiPrime = this.phiConstructor(features.get(i).sPrimeActionFeatures, nf);
        double r = this.dataset.get(i).r;

        SimpleMatrix numerator = B.mult(phi).mult(phi.minus(phiPrime.scale(gamma)).transpose()).mult(B);
        SimpleMatrix denomenatorM = phi.minus(phiPrime.scale(this.gamma)).transpose().mult(B).mult(phi);
        double denomenator = denomenatorM.get(0) + 1;

        B = B.minus(numerator.scale(1./denomenator));
        b = b.plus(phi.scale(r));

        //DPrint.cl(0, "updated matrix for row " + i + "/" + features.size());
    }

    SimpleMatrix w = B.mult(b);

    this.vfa = this.vfa.copy();
    for(int i = 0; i < nf; i++){
        this.vfa.setParameter(i, w.get(i, 0));
    }

    return w;
}
Example #21
Source File: UCT.java From burlap with Apache License 2.0

/**
 * Plans from the input state and then returns a {@link burlap.behavior.policy.GreedyQPolicy} that greedily
 * selects the action with the highest Q-value and breaks ties uniformly randomly.
 * @param initialState the initial state of the planning problem
 * @return a {@link burlap.behavior.policy.GreedyQPolicy}.
 */
@Override
public GreedyQPolicy planFromState(State initialState) {

    foundGoal = false;

    treeSize = 1;
    numVisits = 0;

    HashableState shi = this.stateHash(initialState);
    root = stateNodeConstructor.generate(shi, 0, actionTypes, actionNodeConstructor);

    uniqueStatesInTree = new HashSet<HashableState>();

    stateDepthIndex = new ArrayList<Map<HashableState,UCTStateNode>>();
    statesToStateNodes = new HashMap<HashableState, List<UCTStateNode>>();
    Map <HashableState, UCTStateNode> depth0Map = new HashMap<HashableState, UCTStateNode>();
    depth0Map.put(shi, root);
    stateDepthIndex.add(depth0Map);

    int lastNumUnique = 0;

    numRollOutsFromRoot = 0;
    while(!this.stopPlanning()){

        this.initializeRollOut();

        this.treeRollOut(root, 0, maxHorizon);

        numRollOutsFromRoot++;

        int nu = uniqueStatesInTree.size();
        if(nu - lastNumUnique > 0){
            DPrint.cl(debugCode, String.valueOf(numRollOutsFromRoot) + "; unique states: " + nu + "; tree size: " + treeSize + "; total visits: " + numVisits);
            lastNumUnique = nu;
        }
    }

    DPrint.cl(debugCode, "\nRollouts: " + numRollOutsFromRoot + "; Best Action Expected Return: " + this.bestReturnAction(root).averageReturn());

    return new GreedyQPolicy(this);
}
Example #22
Source File: FittedVI.java From burlap with Apache License 2.0

/**
 * Plans from the input state and then returns a {@link burlap.behavior.policy.GreedyQPolicy} that greedily
 * selects the action with the highest Q-value and breaks ties uniformly randomly.
 * @param initialState the initial state of the planning problem
 * @return a {@link burlap.behavior.policy.GreedyQPolicy}.
 */
@Override
public GreedyQPolicy planFromState(State initialState) {
    this.runVI();
    return new GreedyQPolicy(this);
}
Example #23
Source File: RTDP.java From burlap with Apache License 2.0

/**
 * Initializes. The value function will be initialized to vInit by default everywhere and will use a greedy policy with random tie breaks
 * for performing rollouts. Use the {@link #setValueFunctionInitialization(burlap.behavior.valuefunction.ValueFunction)} method
 * to change the value function initialization and the {@link #setRollOutPolicy(Policy)} method to change the rollout policy to something else. vInit
 * should be set to something optimistic like VMax to ensure convergence.
 * @param domain the domain in which to plan
 * @param gamma the discount factor
 * @param hashingFactory the state hashing factory to use
 * @param vInit the value to which the value function for all states will be initialized
 * @param numRollouts the number of rollouts to perform when planning is started.
 * @param maxDelta when the maximum change in the value function from a rollout is smaller than this value, planning will terminate.
 * @param maxDepth the maximum depth/length of a rollout before it is terminated and Bellman updates are performed.
 */
public RTDP(SADomain domain, double gamma, HashableStateFactory hashingFactory, double vInit, int numRollouts, double maxDelta, int maxDepth){

    this.DPPInit(domain, gamma, hashingFactory);

    this.numRollouts = numRollouts;
    this.maxDelta = maxDelta;
    this.maxDepth = maxDepth;
    this.rollOutPolicy = new GreedyQPolicy(this);

    this.valueInitializer = new ConstantValueFunction(vInit);
}
Example #24
Source File: RTDP.java From burlap with Apache License 2.0

/**
 * Initializes. The value function will be initialized to vInit by default everywhere and will use a greedy policy with random tie breaks
 * for performing rollouts. Use the {@link #setValueFunctionInitialization(burlap.behavior.valuefunction.ValueFunction)} method
 * to change the value function initialization and the {@link #setRollOutPolicy(Policy)} method to change the rollout policy to something else. vInit
 * should be set to something optimistic like VMax to ensure convergence.
 * @param domain the domain in which to plan
 * @param gamma the discount factor
 * @param hashingFactory the state hashing factory to use
 * @param vInit the object which defines how the value function will be initialized for each individual state.
 * @param numRollouts the number of rollouts to perform when planning is started.
 * @param maxDelta when the maximum change in the value function from a rollout is smaller than this value, planning will terminate.
 * @param maxDepth the maximum depth/length of a rollout before it is terminated and Bellman updates are performed.
 */
public RTDP(SADomain domain, double gamma, HashableStateFactory hashingFactory, ValueFunction vInit, int numRollouts, double maxDelta, int maxDepth){

    this.DPPInit(domain, gamma, hashingFactory);

    this.numRollouts = numRollouts;
    this.maxDelta = maxDelta;
    this.maxDepth = maxDepth;
    this.rollOutPolicy = new GreedyQPolicy(this);

    this.valueInitializer = vInit;
}
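As with the other planners above, these RTDP constructors pair naturally with planFromState (Example #9), which also returns a GreedyQPolicy. A minimal sketch, assuming domain, hashingFactory, and initialState already exist; the initial value, rollout count, delta, and depth below are illustrative placeholders only (the Javadoc recommends an optimistic value such as VMax for vInit):

//1000 rollouts of at most 100 steps each, stopping early once a rollout changes values by less than 0.001
RTDP rtdp = new RTDP(domain, 0.99, hashingFactory, 1., 1000, 0.001, 100);
GreedyQPolicy p = rtdp.planFromState(initialState); //greedy with random tie breaks over the Q-values RTDP computed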