burlap.behavior.singleagent.Episode Java Examples
The following examples show how to use
burlap.behavior.singleagent.Episode.
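Before the examples, here is a minimal, self-contained sketch of the core Episode API that the snippets below rely on: constructing an Episode from an environment's initial observation, recording transitions with transition(), and reading the trajectory back with numTimeSteps(), action(), reward(), and discountedReturn(). The grid world setup mirrors the QLTutorial and VITutorial examples below; the RandomPolicy used for action selection and the import paths are assumptions based on BURLAP 3 rather than something taken from these examples, so treat this as an illustrative sketch, not canonical usage.

import burlap.behavior.policy.Policy;
import burlap.behavior.policy.RandomPolicy;
import burlap.behavior.singleagent.Episode;
import burlap.domain.singleagent.gridworld.GridWorldDomain;
import burlap.domain.singleagent.gridworld.state.GridAgent;
import burlap.domain.singleagent.gridworld.state.GridWorldState;
import burlap.mdp.core.action.Action;
import burlap.mdp.core.state.State;
import burlap.mdp.singleagent.SADomain;
import burlap.mdp.singleagent.environment.EnvironmentOutcome;
import burlap.mdp.singleagent.environment.SimulatedEnvironment;

public class EpisodeBasics {

    public static void main(String[] args) {

        //build a small grid world and a simulated environment (setup mirrors the tutorial examples below)
        GridWorldDomain gwd = new GridWorldDomain(11, 11);
        gwd.setMapToFourRooms();
        SADomain domain = gwd.generateDomain();
        State s = new GridWorldState(new GridAgent(0, 0));
        SimulatedEnvironment env = new SimulatedEnvironment(domain, s);

        //start an Episode from the environment's initial observation
        Episode ep = new Episode(env.currentObservation());

        //act for a few steps (here with a random policy, an assumption for illustration) and record each transition
        Policy p = new RandomPolicy(domain);
        for(int i = 0; i < 5 && !env.isInTerminalState(); i++){
            Action a = p.action(env.currentObservation());
            EnvironmentOutcome eo = env.executeAction(a);
            ep.transition(a, eo.op, eo.r); //record action, resulting state, and reward
        }

        //read the recorded trajectory back out
        System.out.println("Time steps (states) recorded: " + ep.numTimeSteps());
        for(int t = 0; t < ep.numTimeSteps() - 1; t++){
            System.out.println(t + ": " + ep.action(t) + " -> reward " + ep.reward(t + 1));
        }
        System.out.println("Undiscounted return: " + ep.discountedReturn(1.0));
    }
}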
Example #1
Source File: BeliefAgent.java From burlap with Apache License 2.0 | 6 votes |
/**
 * Causes the agent to act for some fixed number of steps. The agent's belief is automatically
 * updated by this method using the specified {@link BeliefUpdate}.
 * The agent's action selection for the current belief state is defined by
 * the {@link #getAction(burlap.mdp.singleagent.pomdp.beliefstate.BeliefState)} method. The observation, action, and reward
 * sequence is saved in an {@link Episode} object and returned.
 * @param maxSteps the maximum number of steps to take in the environment
 * @return an {@link Episode} that recorded the observation, action, and reward sequence.
 */
public Episode actUntilTerminalOrMaxSteps(int maxSteps){
    Episode ea = new Episode();
    ea.initializeInState(this.environment.currentObservation());
    int c = 0;
    while(!this.environment.isInTerminalState() && c < maxSteps){
        Action ga = this.getAction(this.curBelief);
        EnvironmentOutcome eo = environment.executeAction(ga);
        ea.transition(ga, eo.op, eo.r);

        //update our belief
        this.curBelief = this.updater.update(this.curBelief, eo.op, eo.a);

        c++;
    }

    return ea;
}
Example #2
Source File: LearningAlgorithmExperimenter.java From burlap with Apache License 2.0 | 6 votes |
/**
 * Runs a trial for an agent generated by the given factory when interpreting trial length as a number of total steps.
 * @param agentFactory the agent factory used to generate the agent to test.
 */
protected void runStepBoundTrial(LearningAgentFactory agentFactory){

    //temporarily disable plotter data collection to avoid possible contamination from any actions taken during agent generation
    //(e.g., if there is pre-test training)
    this.plotter.toggleDataCollection(false);

    LearningAgent agent = agentFactory.generateAgent();

    this.plotter.toggleDataCollection(true); //turn it back on to begin

    this.plotter.startNewTrial();

    int stepsRemaining = this.trialLength;
    while(stepsRemaining > 0){
        Episode ea = agent.runLearningEpisode(this.environmentSever, stepsRemaining);
        stepsRemaining -= ea.numTimeSteps()-1; //-1 because we want to subtract the number of actions, not the number of states seen
        this.plotter.endEpisode();
        this.environmentSever.resetEnvironment();
    }

    this.plotter.endTrial();

}
Example #3
Source File: ContinuousDomainTutorial.java From burlap_examples with MIT License | 6 votes |
public static void IPSS(){

    InvertedPendulum ip = new InvertedPendulum();
    ip.physParams.actionNoise = 0.;
    RewardFunction rf = new InvertedPendulum.InvertedPendulumRewardFunction(Math.PI/8.);
    TerminalFunction tf = new InvertedPendulum.InvertedPendulumTerminalFunction(Math.PI/8.);
    ip.setRf(rf);
    ip.setTf(tf);
    SADomain domain = ip.generateDomain();

    State initialState = new InvertedPendulumState();

    SparseSampling ss = new SparseSampling(domain, 1, new SimpleHashableStateFactory(), 10, 1);
    ss.setForgetPreviousPlanResults(true);
    ss.toggleDebugPrinting(false);
    Policy p = new GreedyQPolicy(ss);

    Episode e = PolicyUtils.rollout(p, initialState, domain.getModel(), 500);
    System.out.println("Num steps: " + e.maxTimeStep());

    Visualizer v = CartPoleVisualizer.getCartPoleVisualizer();
    new EpisodeSequenceVisualizer(v, domain, Arrays.asList(e));

}
Example #4
Source File: LSPI.java From burlap with Apache License 2.0 | 6 votes |
@Override
public Episode runLearningEpisode(Environment env, int maxSteps) {

    Episode ea = maxSteps != -1 ? PolicyUtils.rollout(this.learningPolicy, env, maxSteps) : PolicyUtils.rollout(this.learningPolicy, env);

    this.updateDatasetWithLearningEpisode(ea);

    if(this.shouldRereunPolicyIteration(ea)){
        this.runPolicyIteration(this.maxNumPlanningIterations, this.maxChange);
        this.numStepsSinceLastLearningPI = 0;
    }
    else{
        this.numStepsSinceLastLearningPI += ea.numTimeSteps()-1;
    }

    if(episodeHistory.size() >= numEpisodesToStore){
        episodeHistory.poll();
    }
    episodeHistory.offer(ea);

    return ea;
}
Example #5
Source File: MLIRL.java From burlap with Apache License 2.0 | 5 votes |
/**
 * Computes and returns the log-likelihood of the given trajectory under the current reward function parameters, weighted by the given weight.
 * @param ea the trajectory
 * @param weight the weight to assign the trajectory
 * @return the log-likelihood of the given trajectory under the current reward function parameters, weighted by the given weight.
 */
public double logLikelihoodOfTrajectory(Episode ea, double weight){
    double logLike = 0.;
    Policy p = new BoltzmannQPolicy((QProvider)this.request.getPlanner(), 1./this.request.getBoltzmannBeta());
    for(int i = 0; i < ea.numTimeSteps()-1; i++){
        this.request.getPlanner().planFromState(ea.state(i));
        double actProb = p.actionProb(ea.state(i), ea.action(i));
        logLike += Math.log(actProb);
    }
    logLike *= weight;
    return logLike;
}
Example #6
Source File: MLIRL.java From burlap with Apache License 2.0 | 5 votes |
/**
 * Computes and returns the gradient of the log-likelihood of all trajectories
 * @return the gradient of the log-likelihood of all trajectories
 */
public FunctionGradient logLikelihoodGradient(){
    HashedAggregator<Integer> gradientSum = new HashedAggregator<Integer>();

    double [] weights = this.request.getEpisodeWeights();
    List<Episode> exampleTrajectories = this.request.getExpertEpisodes();

    for(int i = 0; i < exampleTrajectories.size(); i++){
        Episode ea = exampleTrajectories.get(i);
        double weight = weights[i];

        for(int t = 0; t < ea.numTimeSteps()-1; t++){
            this.request.getPlanner().planFromState(ea.state(t));
            FunctionGradient policyGrad = this.logPolicyGrad(ea.state(t), ea.action(t));
            //weigh it by trajectory strength
            for(FunctionGradient.PartialDerivative pd : policyGrad.getNonZeroPartialDerivatives()){
                double newVal = pd.value * weight;
                gradientSum.add(pd.parameterId, newVal);
            }
        }
    }

    FunctionGradient gradient = new FunctionGradient.SparseGradient(gradientSum.size());
    for(Map.Entry<Integer, Double> e : gradientSum.entrySet()){
        gradient.put(e.getKey(), e.getValue());
    }

    return gradient;
}
Example #7
Source File: MultipleIntentionsMLIRLRequest.java From burlap with Apache License 2.0 | 5 votes |
/**
 * Initializes
 * @param domain the domain of the problem
 * @param plannerFactory A {@link burlap.behavior.singleagent.learnfromdemo.mlirl.support.QGradientPlannerFactory} that produces {@link DifferentiableQFunction} objects.
 * @param expertEpisodes the expert trajectories
 * @param rf the {@link burlap.behavior.singleagent.learnfromdemo.mlirl.support.DifferentiableRF} model to use.
 * @param k the number of clusters
 */
public MultipleIntentionsMLIRLRequest(SADomain domain, QGradientPlannerFactory plannerFactory, List<Episode> expertEpisodes, DifferentiableRF rf, int k) {
    super(domain, null, expertEpisodes, rf);
    this.plannerFactory = plannerFactory;
    this.k = k;
    if(this.plannerFactory != null) {
        this.setPlanner((Planner) plannerFactory.generateDifferentiablePlannerForRequest(this));
    }
}
Example #8
Source File: TestPlanning.java From burlap with Apache License 2.0 | 5 votes |
@Test
public void testBFS() {
    GridWorldState initialState = new GridWorldState(new GridAgent(0, 0), new GridLocation(10, 10, 0, "loc0"));

    DeterministicPlanner planner = new BFS(this.domain, this.goalCondition, this.hashingFactory);
    planner.planFromState(initialState);
    Policy p = new SDPlannerPolicy(planner);
    Episode analysis = rollout(p, initialState, domain.getModel());
    this.evaluateEpisode(analysis, true);
}
Example #9
Source File: MacroAction.java From burlap with Apache License 2.0 | 5 votes |
@Override
public double probabilityOfTermination(State s, Episode history) {
    if(history.actionSequence.size() >= actionSequence.size()){
        return 1.;
    }
    return 0.;
}
Example #10
Source File: TestPlanning.java From burlap with Apache License 2.0 | 5 votes |
public void evaluateEpisode(Episode analysis, Boolean expectOptimal) {
    if (expectOptimal) {
        Assert.assertEquals(this.gw.getHeight() + this.gw.getWidth() - 1, analysis.stateSequence.size());
        Assert.assertEquals(analysis.stateSequence.size()-1, analysis.actionSequence.size());
        Assert.assertEquals(analysis.actionSequence.size(), analysis.rewardSequence.size());
        Assert.assertEquals(-analysis.actionSequence.size(), analysis.discountedReturn(1.0), TestPlanning.delta);
    }

    Assert.assertEquals(true, domain.getModel().terminal(analysis.stateSequence.get(analysis.stateSequence.size()-1)));
    Assert.assertEquals(true, this.goalCondition.satisfies(analysis.stateSequence.get(analysis.stateSequence.size()-1)));
}
Example #11
Source File: PolicyUtils.java From burlap with Apache License 2.0 | 5 votes |
/**
 * Follows the policy in the given {@link burlap.mdp.singleagent.environment.Environment}. The policy will stop being followed once a terminal state
 * in the environment is reached or when the provided number of steps has been taken.
 * @param p the {@link Policy}
 * @param env The {@link burlap.mdp.singleagent.environment.Environment} in which this policy is to be evaluated.
 * @param numSteps the maximum number of steps to take in the environment.
 * @return An {@link Episode} object specifying the interaction with the environment.
 */
public static Episode rollout(Policy p, Environment env, int numSteps){
    Episode ea = new Episode(env.currentObservation());

    int nSteps;
    do{
        followAndRecordPolicy(p, env, ea);
        nSteps = ea.numTimeSteps();
    }while(!env.isInTerminalState() && nSteps < numSteps);

    return ea;
}
Example #12
Source File: TestPlanning.java From burlap with Apache License 2.0 | 5 votes |
@Test
public void testAStar() {
    GridWorldState initialState = new GridWorldState(new GridAgent(0, 0), new GridLocation(10, 10, 0, "loc0"));

    Heuristic mdistHeuristic = new Heuristic() {

        @Override
        public double h(State s) {
            GridAgent agent = ((GridWorldState)s).agent;
            GridLocation location = ((GridWorldState)s).locations.get(0);

            //get agent position
            int ax = agent.x;
            int ay = agent.y;

            //get location position
            int lx = location.x;
            int ly = location.y;

            //compute Manhattan distance
            double mdist = Math.abs(ax-lx) + Math.abs(ay-ly);

            return -mdist;
        }
    };

    //provide A* the heuristic as well as the reward function so that it can keep
    //track of the actual cost
    DeterministicPlanner planner = new AStar(domain, goalCondition, hashingFactory, mdistHeuristic);
    planner.planFromState(initialState);
    Policy p = new SDPlannerPolicy(planner);

    Episode analysis = PolicyUtils.rollout(p, initialState, domain.getModel());
    this.evaluateEpisode(analysis, true);
}
Example #13
Source File: LSPI.java From burlap with Apache License 2.0 | 5 votes |
/**
 * Updates this object's {@link SARSData} to include the results of a learning episode.
 * @param ea the learning episode as an {@link Episode} object.
 */
protected void updateDatasetWithLearningEpisode(Episode ea){
    if(this.dataset == null){
        this.dataset = new SARSData(ea.numTimeSteps()-1);
    }
    for(int i = 0; i < ea.numTimeSteps()-1; i++){
        this.dataset.add(ea.state(i), ea.action(i), ea.reward(i+1), ea.state(i+1));
    }
}
Example #14
Source File: RTDP.java From burlap with Apache License 2.0 | 5 votes |
/**
 * Performs Bellman updates only after a rollout is complete and in reverse order
 * @param initialState the initial state from which to plan
 */
protected void batchRTDP(State initialState){

    int totalStates = 0;

    int consecutiveSmallDeltas = 0;
    for(int i = 0; i < numRollouts; i++){

        Episode ea = PolicyUtils.rollout(rollOutPolicy, initialState, model, maxDepth);

        LinkedList<HashableState> orderedStates = new LinkedList<HashableState>();
        for(State s : ea.stateSequence){
            orderedStates.addFirst(this.stateHash(s));
        }

        double delta = this.performOrderedBellmanUpdates(orderedStates);
        totalStates += orderedStates.size();
        DPrint.cl(debugCode, "Pass: " + i + "; Num states: " + orderedStates.size() + " (total: " + totalStates + ")");

        if(delta < this.maxDelta){
            consecutiveSmallDeltas++;
            if(consecutiveSmallDeltas >= this.minNumRolloutsWithSmallValueChange){
                break;
            }
        }
        else{
            consecutiveSmallDeltas = 0;
        }
    }

}
Example #15
Source File: Option.java From burlap with Apache License 2.0 | 5 votes |
public static EnvironmentOptionOutcome control(Option o, Environment env, double discount){
    Random rand = RandomFactory.getMapped(0);

    State initial = env.currentObservation();
    State cur = initial;

    Episode episode = new Episode(cur);
    Episode history = new Episode(cur);
    double roll;
    double pT;
    int nsteps = 0;
    double r = 0.;
    double cd = 1.;
    do{
        Action a = o.policy(cur, history);
        EnvironmentOutcome eo = env.executeAction(a);
        nsteps++;
        r += cd*eo.r;
        cur = eo.op;
        cd *= discount;

        history.transition(a, eo.op, eo.r);

        AnnotatedAction annotatedAction = new AnnotatedAction(a, o.toString() + "(" + nsteps + ")");
        episode.transition(annotatedAction, eo.op, r);

        pT = o.probabilityOfTermination(eo.op, history);
        roll = rand.nextDouble();

    }while(roll > pT && !env.isInTerminalState());

    EnvironmentOptionOutcome eoo = new EnvironmentOptionOutcome(initial, o, cur, r, env.isInTerminalState(), discount, episode);

    return eoo;
}
Example #16
Source File: QLTutorial.java From burlap_examples with MIT License | 5 votes |
public static void main(String[] args) {

    GridWorldDomain gwd = new GridWorldDomain(11, 11);
    gwd.setMapToFourRooms();
    gwd.setProbSucceedTransitionDynamics(0.8);
    gwd.setTf(new GridWorldTerminalFunction(10, 10));

    SADomain domain = gwd.generateDomain();

    //get initial state with agent in 0,0
    State s = new GridWorldState(new GridAgent(0, 0));

    //create environment
    SimulatedEnvironment env = new SimulatedEnvironment(domain, s);

    //create Q-learning
    QLTutorial agent = new QLTutorial(domain, 0.99, new SimpleHashableStateFactory(),
            new ConstantValueFunction(), 0.1, 0.1);

    //run Q-learning and store results in a list
    List<Episode> episodes = new ArrayList<Episode>(1000);
    for(int i = 0; i < 1000; i++){
        episodes.add(agent.runLearningEpisode(env));
        env.resetEnvironment();
    }

    Visualizer v = GridWorldVisualizer.getVisualizer(gwd.getMap());
    new EpisodeSequenceVisualizer(v, domain, episodes);

}
Example #17
Source File: QLTutorial.java From burlap_examples with MIT License | 5 votes |
@Override
public Episode runLearningEpisode(Environment env, int maxSteps) {
    //initialize our episode object with the initial state of the environment
    Episode e = new Episode(env.currentObservation());

    //behave until a terminal state or max steps is reached
    State curState = env.currentObservation();
    int steps = 0;
    while(!env.isInTerminalState() && (steps < maxSteps || maxSteps == -1)){

        //select an action
        Action a = this.learningPolicy.action(curState);

        //take the action and observe outcome
        EnvironmentOutcome eo = env.executeAction(a);

        //record result
        e.transition(eo);

        //get the max Q value of the resulting state if it's not terminal, 0 otherwise
        double maxQ = eo.terminated ? 0. : this.value(eo.op);

        //update the old Q-value
        QValue oldQ = this.storedQ(curState, a);
        oldQ.q = oldQ.q + this.learningRate * (eo.r + this.gamma * maxQ - oldQ.q);

        //update state pointer to next environment state observed
        curState = eo.op;
        steps++;

    }

    return e;
}
Example #18
Source File: SubgoalOption.java From burlap with Apache License 2.0 | 5 votes |
@Override
public double probabilityOfTermination(State s, Episode history) {
    if(terminationStates.satisfies(s) || !policy.definedFor(s)){
        return 1.;
    }
    return 0.;
}
Example #19
Source File: VITutorial.java From burlap_examples with MIT License | 5 votes |
public static void main(String [] args){

    GridWorldDomain gwd = new GridWorldDomain(11, 11);
    gwd.setTf(new GridWorldTerminalFunction(10, 10));
    gwd.setMapToFourRooms();

    //only go in the intended direction 80% of the time
    gwd.setProbSucceedTransitionDynamics(0.8);

    SADomain domain = gwd.generateDomain();

    //get initial state with agent in 0,0
    State s = new GridWorldState(new GridAgent(0, 0));

    //set up VI with a 0.99 discount factor, a value
    //function initialization that initializes all states to value 0, and which will
    //run for 30 iterations over the state space
    VITutorial vi = new VITutorial(domain, 0.99, new SimpleHashableStateFactory(),
            new ConstantValueFunction(0.0), 30);

    //run planning from our initial state
    Policy p = vi.planFromState(s);

    //evaluate the policy with one rollout and visualize the trajectory
    Episode ea = PolicyUtils.rollout(p, s, domain.getModel());

    Visualizer v = GridWorldVisualizer.getVisualizer(gwd.getMap());
    new EpisodeSequenceVisualizer(v, domain, Arrays.asList(ea));

}
Example #20
Source File: MLIRL.java From burlap with Apache License 2.0 | 5 votes |
/**
 * Computes and returns the log-likelihood of all expert trajectories under the current reward function parameters.
 * @return the log-likelihood of all expert trajectories under the current reward function parameters.
 */
public double logLikelihood(){

    double [] weights = this.request.getEpisodeWeights();
    List<Episode> exampleTrajectories = this.request.getExpertEpisodes();

    double sum = 0.;
    for(int i = 0; i < exampleTrajectories.size(); i++){
        sum += this.logLikelihoodOfTrajectory(exampleTrajectories.get(i), weights[i]);
    }

    return sum;

}
Example #21
Source File: SubgoalOption.java From burlap with Apache License 2.0 | 5 votes |
@Override
public List<ActionProb> policyDistribution(State s, Episode history) {
    if(!(policy instanceof EnumerablePolicy)){
        throw new RuntimeException("SubgoalOption cannot return policy distribution because underlying policy is not an EnumerablePolicy");
    }
    return ((EnumerablePolicy)policy).policyDistribution(s);
}
Example #22
Source File: BeliefAgent.java From burlap with Apache License 2.0 | 5 votes |
/**
 * Causes the agent to act until the environment reaches a termination condition. The agent's belief is automatically
 * updated by this method using the specified {@link BeliefUpdate}.
 * The agent's action selection for the current belief state is defined by
 * the {@link #getAction(burlap.mdp.singleagent.pomdp.beliefstate.BeliefState)} method. The observation, action, and reward
 * sequence is saved in an {@link Episode} object and returned.
 * @return an {@link Episode} that recorded the observation, action, and reward sequence.
 */
public Episode actUntilTerminal(){
    Episode ea = new Episode();
    ea.initializeInState(this.environment.currentObservation());
    while(!this.environment.isInTerminalState()){
        Action ga = this.getAction(this.curBelief);
        EnvironmentOutcome eo = environment.executeAction(ga);
        ea.transition(ga, eo.op, eo.r);

        //update our belief
        this.curBelief = this.updater.update(this.curBelief, eo.op, eo.a);

    }

    return ea;
}
Example #23
Source File: LSPI.java From burlap with Apache License 2.0 | 5 votes |
/**
 * Returns whether LSPI should be rerun given the latest learning episode results. Default behavior is to return true
 * if the number of learning episode steps plus the number of steps since the last run is greater than the {@link #minNewStepsForLearningPI} threshold.
 * @param ea the most recent learning episode
 * @return true if LSPI should be rerun; false otherwise.
 */
protected boolean shouldRereunPolicyIteration(Episode ea){
    if(this.numStepsSinceLastLearningPI+ea.numTimeSteps()-1 > this.minNewStepsForLearningPI){
        return true;
    }
    return false;
}
Example #24
Source File: TestPlanning.java From burlap with Apache License 2.0 | 5 votes |
@Test
public void testDFS() {
    GridWorldState initialState = new GridWorldState(new GridAgent(0, 0), new GridLocation(10, 10, 0, "loc0"));

    DeterministicPlanner planner = new DFS(this.domain, this.goalCondition, this.hashingFactory, -1, true);
    planner.planFromState(initialState);
    Policy p = new SDPlannerPolicy(planner);
    Episode analysis = rollout(p, initialState, domain.getModel());
    this.evaluateEpisode(analysis);
}
Example #25
Source File: PotentialShapedRMax.java From burlap with Apache License 2.0 | 4 votes |
@Override
public Episode runLearningEpisode(Environment env) {
    return this.runLearningEpisode(env, -1);
}
Example #26
Source File: LSPI.java From burlap with Apache License 2.0 | 4 votes |
@Override
public Episode runLearningEpisode(Environment env) {
    return this.runLearningEpisode(env, -1);
}
Example #27
Source File: ApproximateQLearning.java From burlap with Apache License 2.0 | 4 votes |
@Override
public Episode runLearningEpisode(Environment env) {
    return this.runLearningEpisode(env, -1);
}
Example #28
Source File: ApprenticeshipLearning.java From burlap with Apache License 2.0 | 4 votes |
/**
 * Returns the initial state of a randomly chosen episode analysis
 * @param episodes the expert demonstrations
 * @return a random episode's initial state
 */
public static State getInitialState(List<Episode> episodes) {
    Random rando = new Random();
    Episode randomEpisode = episodes.get(rando.nextInt(episodes.size()));
    return randomEpisode.state(0);
}
Example #29
Source File: TestBlockDude.java From burlap with Apache License 2.0 | 4 votes |
public void testDude(State s) {
    TerminalFunction tf = new BlockDudeTF();
    StateConditionTest sc = new TFGoalCondition(tf);

    AStar astar = new AStar(domain, sc, new SimpleHashableStateFactory(), new NullHeuristic());
    astar.toggleDebugPrinting(false);
    astar.planFromState(s);

    Policy p = new SDPlannerPolicy(astar);

    Episode ea = PolicyUtils.rollout(p, s, domain.getModel(), 100);

    State lastState = ea.stateSequence.get(ea.stateSequence.size() - 1);
    Assert.assertEquals(true, tf.isTerminal(lastState));
    Assert.assertEquals(true, sc.satisfies(lastState));
    Assert.assertEquals(-94.0, ea.discountedReturn(1.0), 0.001);

    /*
    BlockDude constructor = new BlockDude();
    Domain d = constructor.generateDomain();

    List<Integer> px = new ArrayList<Integer>();
    List<Integer> ph = new ArrayList<Integer>();

    ph.add(15);
    ph.add(3);
    ph.add(3);
    ph.add(3);
    ph.add(0);
    ph.add(0);
    ph.add(0);
    ph.add(1);
    ph.add(2);
    ph.add(0);
    ph.add(2);
    ph.add(3);
    ph.add(2);
    ph.add(2);
    ph.add(3);
    ph.add(3);
    ph.add(15);

    State o = BlockDude.getCleanState(d, px, ph, 6);
    o = BlockDude.setAgent(o, 9, 3, 1, 0);
    o = BlockDude.setExit(o, 1, 0);

    o = BlockDude.setBlock(o, 0, 5, 1);
    o = BlockDude.setBlock(o, 1, 6, 1);
    o = BlockDude.setBlock(o, 2, 14, 3);
    o = BlockDude.setBlock(o, 3, 16, 4);
    o = BlockDude.setBlock(o, 4, 17, 4);
    o = BlockDude.setBlock(o, 5, 17, 5);

    TerminalFunction tf = new SinglePFTF(d.getPropFunction(BlockDude.PFATEXIT));
    StateConditionTest sc = new SinglePFSCT(d.getPropFunction(BlockDude.PFATEXIT));

    RewardFunction rf = new UniformCostRF();

    AStar astar = new AStar(d, rf, sc, new DiscreteStateHashFactory(), new NullHeuristic());
    astar.toggleDebugPrinting(false);
    astar.planFromState(o);

    Policy p = new SDPlannerPolicy(astar);

    EpisodeAnalysis ea = p.evaluateBehavior(o, rf, tf, 100);

    State lastState = ea.stateSequence.get(ea.stateSequence.size() - 1);
    Assert.assertEquals(true, tf.isTerminal(lastState));
    Assert.assertEquals(true, sc.satisfies(lastState));
    Assert.assertEquals(-94.0, ea.getDiscountedReturn(1.0), 0.001);
    */
}
Example #30
Source File: PotentialShapedRMax.java From burlap with Apache License 2.0 | 4 votes |
public List<Episode> getAllStoredLearningEpisodes() {
    return episodeHistory;
}