Java Code Examples for burlap.behavior.policy.PolicyUtils#rollout()
The following examples show how to use burlap.behavior.policy.PolicyUtils#rollout().
The originating project, source file, and license are noted above each example.
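For orientation before the project examples, here is a minimal, self-contained sketch of the most common call pattern: rolling a policy out against a domain's model with a step cap. It is adapted from Example 10 below; the class name RolloutSketch is hypothetical and the import paths assume BURLAP 3. The examples that follow also use the environment-based overloads rollout(policy, env) and rollout(policy, env, maxSteps), which execute the policy in a live Environment rather than against a model.

import burlap.behavior.policy.Policy;
import burlap.behavior.policy.PolicyUtils;
import burlap.behavior.policy.RandomPolicy;
import burlap.behavior.singleagent.Episode;
import burlap.domain.singleagent.gridworld.GridWorldDomain;
import burlap.domain.singleagent.gridworld.state.GridAgent;
import burlap.domain.singleagent.gridworld.state.GridWorldState;
import burlap.mdp.core.state.State;
import burlap.mdp.singleagent.SADomain;

public class RolloutSketch {

    public static void main(String[] args) {

        //build an 11x11 grid world and an initial state with the agent at (0, 0)
        GridWorldDomain gwd = new GridWorldDomain(11, 11);
        SADomain domain = gwd.generateDomain();
        State s = new GridWorldState(new GridAgent(0, 0));

        //roll a random policy out against the domain's model, capped at 30 steps
        Policy p = new RandomPolicy(domain);
        Episode e = PolicyUtils.rollout(p, s, domain.getModel(), 30);

        System.out.println("Steps taken: " + e.maxTimeStep());
    }
}

Omitting the step cap, as in Example 4, runs the rollout until a terminal state is reached.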
Example 1
Source File: ContinuousDomainTutorial.java From burlap_examples with MIT License | 6 votes |
public static void IPSS(){

    InvertedPendulum ip = new InvertedPendulum();
    ip.physParams.actionNoise = 0.;
    RewardFunction rf = new InvertedPendulum.InvertedPendulumRewardFunction(Math.PI/8.);
    TerminalFunction tf = new InvertedPendulum.InvertedPendulumTerminalFunction(Math.PI/8.);
    ip.setRf(rf);
    ip.setTf(tf);
    SADomain domain = ip.generateDomain();

    State initialState = new InvertedPendulumState();

    SparseSampling ss = new SparseSampling(domain, 1, new SimpleHashableStateFactory(), 10, 1);
    ss.setForgetPreviousPlanResults(true);
    ss.toggleDebugPrinting(false);
    Policy p = new GreedyQPolicy(ss);

    Episode e = PolicyUtils.rollout(p, initialState, domain.getModel(), 500);
    System.out.println("Num steps: " + e.maxTimeStep());

    Visualizer v = CartPoleVisualizer.getCartPoleVisualizer();
    new EpisodeSequenceVisualizer(v, domain, Arrays.asList(e));

}
Example 2
Source File: LSPI.java From burlap with Apache License 2.0 | 6 votes |
@Override
public Episode runLearningEpisode(Environment env, int maxSteps) {

    Episode ea = maxSteps != -1 ? PolicyUtils.rollout(this.learningPolicy, env, maxSteps)
            : PolicyUtils.rollout(this.learningPolicy, env);

    this.updateDatasetWithLearningEpisode(ea);

    if(this.shouldRereunPolicyIteration(ea)){
        this.runPolicyIteration(this.maxNumPlanningIterations, this.maxChange);
        this.numStepsSinceLastLearningPI = 0;
    }
    else{
        this.numStepsSinceLastLearningPI += ea.numTimeSteps()-1;
    }

    if(episodeHistory.size() >= numEpisodesToStore){
        episodeHistory.poll();
    }
    episodeHistory.offer(ea);

    return ea;
}
Example 3
Source File: Main.java From cs7641-assignment4 with MIT License | 5 votes |
/**
 * Here is where the magic happens. In this method is where I loop through the specific number
 * of episodes (iterations) and run the specific algorithm. To keep things nice and clean, I use
 * this method to run all three algorithms. The specific details are specified through the
 * PlannerFactory interface.
 *
 * This method collects all the information from the algorithm and packs it in an Analysis
 * instance that later gets dumped on the console.
 */
private static void runAlgorithm(Analysis analysis, Problem problem, SADomain domain, HashableStateFactory hashingFactory,
        State initialState, PlannerFactory plannerFactory, Algorithm algorithm) {

    ConstantStateGenerator constantStateGenerator = new ConstantStateGenerator(initialState);
    SimulatedEnvironment simulatedEnvironment = new SimulatedEnvironment(domain, constantStateGenerator);
    Planner planner = null;
    Policy policy = null;

    for (int episodeIndex = 1; episodeIndex <= problem.getNumberOfIterations(algorithm); episodeIndex++) {
        long startTime = System.nanoTime();
        planner = plannerFactory.createPlanner(episodeIndex, domain, hashingFactory, simulatedEnvironment);
        policy = planner.planFromState(initialState);

        /*
         * If we haven't converged, following the policy will lead the agent wandering around
         * and it might never reach the goal. To avoid this, we need to set the maximum number
         * of steps to take before terminating the policy rollout. I decided to set this maximum
         * at the number of grid locations in our map (width * width). This should give the
         * agent plenty of room to wander around.
         *
         * The smaller this number is, the faster the algorithm will run.
         */
        int maxNumberOfSteps = problem.getWidth() * problem.getWidth();

        Episode episode = PolicyUtils.rollout(policy, initialState, domain.getModel(), maxNumberOfSteps);
        analysis.add(episodeIndex, episode.rewardSequence, episode.numTimeSteps(), (long) (System.nanoTime() - startTime) / 1000000);
    }

    if (algorithm == Algorithm.QLearning && USE_LEARNING_EXPERIMENTER) {
        learningExperimenter(problem, (LearningAgent) planner, simulatedEnvironment);
    }

    if (SHOW_VISUALIZATION && planner != null && policy != null) {
        visualize(problem, (ValueFunction) planner, policy, initialState, domain, hashingFactory, algorithm.getTitle());
    }
}
Example 4
Source File: VITutorial.java From burlap_examples with MIT License | 5 votes |
public static void main(String [] args){

    GridWorldDomain gwd = new GridWorldDomain(11, 11);
    gwd.setTf(new GridWorldTerminalFunction(10, 10));
    gwd.setMapToFourRooms();

    //only go in intended direction 80% of the time
    gwd.setProbSucceedTransitionDynamics(0.8);

    SADomain domain = gwd.generateDomain();

    //get initial state with agent in 0,0
    State s = new GridWorldState(new GridAgent(0, 0));

    //setup vi with 0.99 discount factor, a value function initialization that
    //initializes all states to value 0, and which will run for 30 iterations
    //over the state space
    VITutorial vi = new VITutorial(domain, 0.99, new SimpleHashableStateFactory(),
            new ConstantValueFunction(0.0), 30);

    //run planning from our initial state
    Policy p = vi.planFromState(s);

    //evaluate the policy with one roll out and visualize the trajectory
    Episode ea = PolicyUtils.rollout(p, s, domain.getModel());

    Visualizer v = GridWorldVisualizer.getVisualizer(gwd.getMap());
    new EpisodeSequenceVisualizer(v, domain, Arrays.asList(ea));

}
Example 5
Source File: MCVideo.java From burlap_examples with MIT License | 5 votes |
public static void main(String[] args) {

    MountainCar mcGen = new MountainCar();
    SADomain domain = mcGen.generateDomain();

    StateGenerator rStateGen = new MCRandomStateGenerator(mcGen.physParams);
    SARSCollector collector = new SARSCollector.UniformRandomSARSCollector(domain);
    SARSData dataset = collector.collectNInstances(rStateGen, domain.getModel(), 5000, 20, null);

    NormalizedVariableFeatures features = new NormalizedVariableFeatures()
            .variableDomain("x", new VariableDomain(mcGen.physParams.xmin, mcGen.physParams.xmax))
            .variableDomain("v", new VariableDomain(mcGen.physParams.vmin, mcGen.physParams.vmax));
    FourierBasis fb = new FourierBasis(features, 4);

    LSPI lspi = new LSPI(domain, 0.99, new DenseCrossProductFeatures(fb, 3), dataset);
    Policy p = lspi.runPolicyIteration(30, 1e-6);

    Visualizer v = MountainCarVisualizer.getVisualizer(mcGen);
    VisualActionObserver vob = new VisualActionObserver(v);
    vob.initGUI();

    SimulatedEnvironment env = new SimulatedEnvironment(domain,
            new MCState(mcGen.physParams.valleyPos(), 0));
    EnvironmentServer envServ = new EnvironmentServer(env, vob);

    for(int i = 0; i < 100; i++){
        PolicyUtils.rollout(p, envServ);
        envServ.resetEnvironment();
    }

    System.out.println("Finished");

}
Example 6
Source File: RTDP.java From burlap with Apache License 2.0 | 5 votes |
/**
 * Performs Bellman updates only after a rollout is complete and in reverse order
 * @param initialState the initial state from which to plan
 */
protected void batchRTDP(State initialState){

    int totalStates = 0;
    int consecutiveSmallDeltas = 0;

    for(int i = 0; i < numRollouts; i++){

        Episode ea = PolicyUtils.rollout(rollOutPolicy, initialState, model, maxDepth);

        LinkedList <HashableState> orderedStates = new LinkedList<HashableState>();
        for(State s : ea.stateSequence){
            orderedStates.addFirst(this.stateHash(s));
        }

        double delta = this.performOrderedBellmanUpdates(orderedStates);
        totalStates += orderedStates.size();
        DPrint.cl(debugCode, "Pass: " + i + "; Num states: " + orderedStates.size() + " (total: " + totalStates + ")");

        if(delta < this.maxDelta){
            consecutiveSmallDeltas++;
            if(consecutiveSmallDeltas >= this.minNumRolloutsWithSmallValueChange){
                break;
            }
        }
        else{
            consecutiveSmallDeltas = 0;
        }
    }
}
Example 7
Source File: TestPlanning.java From burlap with Apache License 2.0 | 5 votes |
@Test
public void testAStar() {

    GridWorldState initialState = new GridWorldState(new GridAgent(0, 0), new GridLocation(10, 10, 0, "loc0"));

    Heuristic mdistHeuristic = new Heuristic() {

        @Override
        public double h(State s) {

            GridAgent agent = ((GridWorldState)s).agent;
            GridLocation location = ((GridWorldState)s).locations.get(0);

            //get agent position
            int ax = agent.x;
            int ay = agent.y;

            //get location position
            int lx = location.x;
            int ly = location.y;

            //compute Manhattan distance
            double mdist = Math.abs(ax-lx) + Math.abs(ay-ly);

            return -mdist;
        }
    };

    //provide A* the heuristic as well as the reward function so that it can keep
    //track of the actual cost
    DeterministicPlanner planner = new AStar(domain, goalCondition, hashingFactory, mdistHeuristic);
    planner.planFromState(initialState);
    Policy p = new SDPlannerPolicy(planner);

    Episode analysis = PolicyUtils.rollout(p, initialState, domain.getModel());
    this.evaluateEpisode(analysis, true);
}
Example 8
Source File: SimpleTester.java From burlap_caffe with Apache License 2.0 | 4 votes |
@Override
public Episode runTestEpisode(Environment env, int maxSteps) {
    return PolicyUtils.rollout(policy, env, maxSteps);
}
Example 9
Source File: ContinuousDomainTutorial.java From burlap_examples with MIT License | 4 votes |
public static void MCLSPIRBF(){

    MountainCar mcGen = new MountainCar();
    SADomain domain = mcGen.generateDomain();
    MCState s = new MCState(mcGen.physParams.valleyPos(), 0.);

    NormalizedVariableFeatures inputFeatures = new NormalizedVariableFeatures()
            .variableDomain("x", new VariableDomain(mcGen.physParams.xmin, mcGen.physParams.xmax))
            .variableDomain("v", new VariableDomain(mcGen.physParams.vmin, mcGen.physParams.vmax));

    StateGenerator rStateGen = new MCRandomStateGenerator(mcGen.physParams);
    SARSCollector collector = new SARSCollector.UniformRandomSARSCollector(domain);
    SARSData dataset = collector.collectNInstances(rStateGen, domain.getModel(), 5000, 20, null);

    RBFFeatures rbf = new RBFFeatures(inputFeatures, true);
    FlatStateGridder gridder = new FlatStateGridder()
            .gridDimension("x", mcGen.physParams.xmin, mcGen.physParams.xmax, 5)
            .gridDimension("v", mcGen.physParams.vmin, mcGen.physParams.vmax, 5);

    List<State> griddedStates = gridder.gridState(s);
    DistanceMetric metric = new EuclideanDistance();
    for(State g : griddedStates){
        rbf.addRBF(new GaussianRBF(inputFeatures.features(g), metric, 0.2));
    }

    LSPI lspi = new LSPI(domain, 0.99, new DenseCrossProductFeatures(rbf, 3), dataset);
    Policy p = lspi.runPolicyIteration(30, 1e-6);

    Visualizer v = MountainCarVisualizer.getVisualizer(mcGen);
    VisualActionObserver vob = new VisualActionObserver(v);
    vob.initGUI();

    SimulatedEnvironment env = new SimulatedEnvironment(domain, s);
    env.addObservers(vob);

    for(int i = 0; i < 5; i++){
        PolicyUtils.rollout(p, env);
        env.resetEnvironment();
    }

    System.out.println("Finished");

}
Example 10
Source File: Episode.java From burlap with Apache License 2.0 | 4 votes |
public static void main(String[] args) {

    GridWorldDomain gwd = new GridWorldDomain(11, 11);
    SADomain domain = gwd.generateDomain();
    State s = new GridWorldState(new GridAgent(1, 3));

    Policy p = new RandomPolicy(domain);
    Episode ea = PolicyUtils.rollout(p, s, domain.getModel(), 30);

    String yamlOut = ea.serialize();
    System.out.println(yamlOut);
    System.out.println("\n\n");

    Episode read = Episode.parseEpisode(yamlOut);

    System.out.println(read.actionString());
    System.out.println(read.state(0).toString());
    System.out.println(read.actionSequence.size());
    System.out.println(read.stateSequence.size());
}
Example 11
Source File: TestBlockDude.java From burlap with Apache License 2.0 | 4 votes |
public void testDude(State s) {

    TerminalFunction tf = new BlockDudeTF();
    StateConditionTest sc = new TFGoalCondition(tf);

    AStar astar = new AStar(domain, sc, new SimpleHashableStateFactory(), new NullHeuristic());
    astar.toggleDebugPrinting(false);
    astar.planFromState(s);

    Policy p = new SDPlannerPolicy(astar);

    Episode ea = PolicyUtils.rollout(p, s, domain.getModel(), 100);

    State lastState = ea.stateSequence.get(ea.stateSequence.size() - 1);
    Assert.assertEquals(true, tf.isTerminal(lastState));
    Assert.assertEquals(true, sc.satisfies(lastState));
    Assert.assertEquals(-94.0, ea.discountedReturn(1.0), 0.001);

    /*
    BlockDude constructor = new BlockDude();
    Domain d = constructor.generateDomain();

    List<Integer> px = new ArrayList<Integer>();
    List <Integer> ph = new ArrayList<Integer>();

    ph.add(15); ph.add(3); ph.add(3); ph.add(3); ph.add(0); ph.add(0); ph.add(0); ph.add(1); ph.add(2);
    ph.add(0); ph.add(2); ph.add(3); ph.add(2); ph.add(2); ph.add(3); ph.add(3); ph.add(15);

    State o = BlockDude.getCleanState(d, px, ph, 6);
    o = BlockDude.setAgent(o, 9, 3, 1, 0);
    o = BlockDude.setExit(o, 1, 0);

    o = BlockDude.setBlock(o, 0, 5, 1);
    o = BlockDude.setBlock(o, 1, 6, 1);
    o = BlockDude.setBlock(o, 2, 14, 3);
    o = BlockDude.setBlock(o, 3, 16, 4);
    o = BlockDude.setBlock(o, 4, 17, 4);
    o = BlockDude.setBlock(o, 5, 17, 5);

    TerminalFunction tf = new SinglePFTF(d.getPropFunction(BlockDude.PFATEXIT));
    StateConditionTest sc = new SinglePFSCT(d.getPropFunction(BlockDude.PFATEXIT));
    RewardFunction rf = new UniformCostRF();

    AStar astar = new AStar(d, rf, sc, new DiscreteStateHashFactory(), new NullHeuristic());
    astar.toggleDebugPrinting(false);
    astar.planFromState(o);

    Policy p = new SDPlannerPolicy(astar);

    EpisodeAnalysis ea = p.evaluateBehavior(o, rf, tf, 100);

    State lastState = ea.stateSequence.get(ea.stateSequence.size() - 1);
    Assert.assertEquals(true, tf.isTerminal(lastState));
    Assert.assertEquals(true, sc.satisfies(lastState));
    Assert.assertEquals(-94.0, ea.getDiscountedReturn(1.0), 0.001);
    */
}
Example 12
Source File: ContinuousDomainTutorial.java From burlap_examples with MIT License | 3 votes |
public static void MCLSPIFB(){

    MountainCar mcGen = new MountainCar();
    SADomain domain = mcGen.generateDomain();

    StateGenerator rStateGen = new MCRandomStateGenerator(mcGen.physParams);
    SARSCollector collector = new SARSCollector.UniformRandomSARSCollector(domain);
    SARSData dataset = collector.collectNInstances(rStateGen, domain.getModel(), 5000, 20, null);

    NormalizedVariableFeatures inputFeatures = new NormalizedVariableFeatures()
            .variableDomain("x", new VariableDomain(mcGen.physParams.xmin, mcGen.physParams.xmax))
            .variableDomain("v", new VariableDomain(mcGen.physParams.vmin, mcGen.physParams.vmax));
    FourierBasis fb = new FourierBasis(inputFeatures, 4);

    LSPI lspi = new LSPI(domain, 0.99, new DenseCrossProductFeatures(fb, 3), dataset);
    Policy p = lspi.runPolicyIteration(30, 1e-6);

    Visualizer v = MountainCarVisualizer.getVisualizer(mcGen);
    VisualActionObserver vob = new VisualActionObserver(v);
    vob.initGUI();

    SimulatedEnvironment env = new SimulatedEnvironment(domain,
            new MCState(mcGen.physParams.valleyPos(), 0.));
    env.addObservers(vob);

    for(int i = 0; i < 5; i++){
        PolicyUtils.rollout(p, env);
        env.resetEnvironment();
    }

    System.out.println("Finished");

}
Example 13
Source File: MinecraftSolver.java From burlapcraft with GNU Lesser General Public License v3.0 | 3 votes |
public static void stocasticPlan(double gamma){

    MinecraftDomainGenerator simdg = new MinecraftDomainGenerator();
    SADomain domain = simdg.generateDomain();

    State initialState = MinecraftStateGeneratorHelper.getCurrentState(BurlapCraft.currentDungeon);

    Planner planner = new ValueIteration(domain, gamma, new SimpleHashableStateFactory(false), 0.001, 1000);
    Policy p = planner.planFromState(initialState);

    MinecraftEnvironment me = new MinecraftEnvironment();
    PolicyUtils.rollout(p, me);
}