Java Code Examples for burlap.mdp.singleagent.environment.Environment#executeAction()
The following examples show how to use burlap.mdp.singleagent.environment.Environment#executeAction(). Each example is drawn from an open-source project; the source file, originating project, and license are noted above its code.
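For orientation before the examples: executeAction(Action) applies one action to an Environment and returns an EnvironmentOutcome whose fields o, a, r, op, and terminated hold the prior observation, the action, the reward, the next observation, and the termination flag. The sketch below shows the call in isolation against a SimulatedEnvironment; the four-rooms GridWorldDomain setup, the fixed "north" action, and the class name ExecuteActionSketch are illustrative assumptions and not part of the examples that follow.

import burlap.domain.singleagent.gridworld.GridWorldDomain;
import burlap.domain.singleagent.gridworld.state.GridAgent;
import burlap.domain.singleagent.gridworld.state.GridWorldState;
import burlap.mdp.core.action.SimpleAction;
import burlap.mdp.core.state.State;
import burlap.mdp.singleagent.SADomain;
import burlap.mdp.singleagent.environment.Environment;
import burlap.mdp.singleagent.environment.EnvironmentOutcome;
import burlap.mdp.singleagent.environment.SimulatedEnvironment;

public class ExecuteActionSketch {
    public static void main(String[] args) {
        //assumed setup: an 11x11 four-rooms grid world with its default reward and terminal functions
        GridWorldDomain gwd = new GridWorldDomain(11, 11);
        gwd.setMapToFourRooms();
        SADomain domain = gwd.generateDomain();

        State initial = new GridWorldState(new GridAgent(0, 0));
        Environment env = new SimulatedEnvironment(domain, initial);

        //execute a handful of "north" actions and inspect each EnvironmentOutcome
        for(int t = 0; t < 10 && !env.isInTerminalState(); t++){
            EnvironmentOutcome eo = env.executeAction(new SimpleAction(GridWorldDomain.ACTION_NORTH));
            System.out.println("r = " + eo.r + ", terminated = " + eo.terminated + ", next state = " + eo.op);
        }
    }
}

The learning and data-collection examples below all follow the same pattern: observe the current state, pick an action, call executeAction(), and consume the returned EnvironmentOutcome.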
Example 1
Source File: SARSCollector.java From burlap with Apache License 2.0 | 6 votes |
@Override
public SARSData collectDataFrom(Environment env, int maxSteps, SARSData intoDataset) {
    if(intoDataset == null){
        intoDataset = new SARSData();
    }

    int nsteps = 0;
    while(!env.isInTerminalState() && nsteps < maxSteps){

        //choose a random applicable action in the current observation
        List<Action> gas = ActionUtils.allApplicableActionsForTypes(this.actionTypes, env.currentObservation());
        Action ga = gas.get(RandomFactory.getMapped(0).nextInt(gas.size()));

        //execute it in the environment and store the resulting SARS tuple
        EnvironmentOutcome eo = env.executeAction(ga);
        intoDataset.add(eo.o, eo.a, eo.r, eo.op);

        nsteps++;
    }

    return intoDataset;
}
Example 2
Source File: QLTutorial.java From burlap_examples with MIT License | 5 votes |
@Override
public Episode runLearningEpisode(Environment env, int maxSteps) {
    //initialize our episode object with the initial state of the environment
    Episode e = new Episode(env.currentObservation());

    //behave until a terminal state or max steps is reached
    State curState = env.currentObservation();
    int steps = 0;
    while(!env.isInTerminalState() && (steps < maxSteps || maxSteps == -1)){

        //select an action
        Action a = this.learningPolicy.action(curState);

        //take the action and observe outcome
        EnvironmentOutcome eo = env.executeAction(a);

        //record result
        e.transition(eo);

        //get the max Q value of the resulting state if it's not terminal, 0 otherwise
        double maxQ = eo.terminated ? 0. : this.value(eo.op);

        //update the old Q-value
        QValue oldQ = this.storedQ(curState, a);
        oldQ.q = oldQ.q + this.learningRate * (eo.r + this.gamma * maxQ - oldQ.q);

        //update state pointer to next environment state observed
        curState = eo.op;
        steps++;
    }

    return e;
}
Example 3
Source File: Option.java From burlap with Apache License 2.0 | 5 votes |
public static EnvironmentOptionOutcome control(Option o, Environment env, double discount){
    Random rand = RandomFactory.getMapped(0);

    State initial = env.currentObservation();
    State cur = initial;

    Episode episode = new Episode(cur);
    Episode history = new Episode(cur);
    double roll;
    double pT;
    int nsteps = 0;
    double r = 0.;
    double cd = 1.;
    do{
        //query the option's policy and execute the selected action
        Action a = o.policy(cur, history);
        EnvironmentOutcome eo = env.executeAction(a);
        nsteps++;

        //accumulate the discounted reward observed so far
        r += cd*eo.r;
        cur = eo.op;
        cd *= discount;

        history.transition(a, eo.op, eo.r);

        AnnotatedAction annotatedAction = new AnnotatedAction(a, o.toString() + "(" + nsteps + ")");
        episode.transition(annotatedAction, eo.op, r);

        //roll against the option's termination probability in the new state
        pT = o.probabilityOfTermination(eo.op, history);
        roll = rand.nextDouble();

    }while(roll > pT && !env.isInTerminalState());

    EnvironmentOptionOutcome eoo = new EnvironmentOptionOutcome(initial, o, cur, r, env.isInTerminalState(), discount, episode);

    return eoo;
}
Example 4
Source File: PolicyUtils.java From burlap with Apache License 2.0 | 4 votes |
/**
 * Follows this policy for one time step in the provided {@link burlap.mdp.singleagent.environment.Environment} and
 * records the interaction in the provided {@link Episode} object. If the policy
 * selects an {@link burlap.behavior.singleagent.options.Option}, then how the option's interaction in the environment
 * is recorded depends on the {@link #rolloutsDecomposeOptions} flag.
 * If {@link #rolloutsDecomposeOptions} is false, then the option is recorded as a single action. If it is true, then
 * the individual primitive actions selected by the option are recorded.
 * @param p the {@link Policy}
 * @param env The {@link burlap.mdp.singleagent.environment.Environment} in which this policy should be followed.
 * @param ea The {@link Episode} object to which the action selection will be recorded.
 */
protected static void followAndRecordPolicy(Policy p, Environment env, Episode ea){

    //follow policy
    Action a = p.action(env.currentObservation());
    if(a == null){
        throw new PolicyUndefinedException();
    }

    EnvironmentOutcome eo = env.executeAction(a);

    if(a instanceof Option && rolloutsDecomposeOptions){
        ea.appendAndMergeEpisodeAnalysis(((EnvironmentOptionOutcome)eo).episode);
    }
    else{
        ea.transition(a, eo.op, eo.r);
    }
}
Example 5
Source File: ApproximateQLearning.java From burlap with Apache License 2.0 | 4 votes |
@Override
public Episode runLearningEpisode(Environment env, int maxSteps) {
    State initialState = env.currentObservation();
    Episode e = new Episode(initialState);

    int eStepCounter = 0;
    while(!env.isInTerminalState() && (eStepCounter < maxSteps || maxSteps == -1)){

        //check state
        State curState = stateMapping.mapState(env.currentObservation());

        //select action
        Action a = this.learningPolicy.action(curState);

        //take action
        EnvironmentOutcome eo = env.executeAction(a);

        //save outcome in memory
        this.memory.addExperience(eo);

        //record transition and manage option case
        int stepInc = eo instanceof EnvironmentOptionOutcome ? ((EnvironmentOptionOutcome)eo).numSteps() : 1;
        eStepCounter += stepInc;
        this.totalSteps += stepInc;
        e.transition(a, eo.op, eo.r);

        //perform learning on a replayed sample of experiences
        List<EnvironmentOutcome> samples = this.memory.sampleExperiences(this.numReplay);
        this.updateQFunction(samples);

        //update stale function
        this.stepsSinceStale++;
        if(this.stepsSinceStale >= this.staleDuration){
            this.updateStaleFunction();
        }
    }

    this.totalEpisodes++;
    return e;
}
Example 6
Source File: ExecuteActionCommand.java From burlap with Apache License 2.0 | 4 votes |
@Override
public int call(BurlapShell shell, String argString, Scanner is, PrintStream os) {

    Environment env = ((EnvironmentShell)shell).getEnv();
    OptionSet oset = this.parser.parse(argString.split(" "));
    List<String> args = (List<String>)oset.nonOptionArguments();
    if(oset.has("h")){
        os.println("[v|a] args*\nCommand to execute an action or set an action name alias.\n" +
                "If -a is not specified, then executes the action with name args[0] with parameters args[1]*\n" +
                "-v: the resulting reward, termination, and observation from execution is printed.\n" +
                "-a: assigns an action name alias where args[0] is the original action name, and args[1] is the alias.");
        return 0;
    }

    if(oset.has("a")){
        if(args.size() != 2){
            return -1;
        }
        this.actionNameMap.put(args.get(1), args.get(0));
        return 0;
    }

    if(args.isEmpty()){
        return -1;
    }

    ActionType actionType = ((SADomain)this.domain).getAction(args.get(0));
    if(actionType == null){
        String actionName = this.actionNameMap.get(args.get(0));
        if(actionName != null){
            actionType = ((SADomain)this.domain).getAction(actionName);
        }
    }

    if(actionType != null){
        Action a = actionType.associatedAction(actionArgs(args));
        EnvironmentOutcome o = env.executeAction(a);
        if(oset.has("v")){
            os.println("reward: " + o.r);
            if(o.terminated){
                os.println("IS terminal");
            }
            else{
                os.println("is NOT terminal");
            }
            os.println(o.op.toString());
        }
        return 1;
    }

    return -1;
}
Example 7
Source File: DeepQTester.java From burlap_caffe with Apache License 2.0 | 3 votes |
@Override
public Episode runTestEpisode(Environment env, int maxSteps) {
    State initialState = env.currentObservation();
    Episode e = new Episode(initialState);

    int eStepCounter = 0;
    while(!env.isInTerminalState() && (eStepCounter < maxSteps || maxSteps == -1)){

        //check state
        State curState = stateMapping.mapState(env.currentObservation());

        //select action
        Action a = this.policy.action(curState);

        //take action
        EnvironmentOutcome eo = env.executeAction(a);

        //save outcome in memory
        this.memory.addExperience(eo);

        //record transition and manage option case
        int stepInc = eo instanceof EnvironmentOptionOutcome ? ((EnvironmentOptionOutcome)eo).numSteps() : 1;
        eStepCounter += stepInc;
        e.transition(a, eo.op, eo.r);
    }

    return e;
}
Example 8
Source File: ActorCritic.java From burlap with Apache License 2.0 | 3 votes |
@Override
public Episode runLearningEpisode(Environment env, int maxSteps) {

    State initialState = env.currentObservation();
    Episode ea = new Episode(initialState);

    State curState = initialState;

    this.critic.startEpisode(curState);
    this.actor.startEpisode(curState);

    int timeSteps = 0;
    while(!env.isInTerminalState() && (timeSteps < maxSteps || maxSteps == -1)){

        Action ga = this.actor.action(curState);
        EnvironmentOutcome eo = env.executeAction(ga);
        ea.transition(eo);

        double critique = this.critic.critique(eo);
        this.actor.update(eo, critique);

        curState = env.currentObservation();
        timeSteps++;
    }

    this.critic.endEpisode();
    this.actor.endEpisode();

    if(episodeHistory.size() >= numEpisodesToStore){
        episodeHistory.poll();
    }
    episodeHistory.offer(ea);

    return ea;
}
Example 9
Source File: ARTDP.java From burlap with Apache License 2.0 | 3 votes |
@Override
public Episode runLearningEpisode(Environment env, int maxSteps) {

    State initialState = env.currentObservation();
    Episode ea = new Episode(initialState);

    State curState = initialState;
    int steps = 0;
    while(!env.isInTerminalState() && (steps < maxSteps || maxSteps == -1)){
        Action ga = policy.action(curState);
        EnvironmentOutcome eo = env.executeAction(ga);
        ea.transition(ga, eo.op, eo.r);

        this.model.updateModel(eo);

        this.modelPlanner.performBellmanUpdateOn(eo.o);

        curState = env.currentObservation();
        steps++;
    }

    return ea;
}
Example 10
Source File: QLearning.java From burlap with Apache License 2.0 | 2 votes |
@Override
public Episode runLearningEpisode(Environment env, int maxSteps) {

    State initialState = env.currentObservation();

    Episode ea = new Episode(initialState);
    HashableState curState = this.stateHash(initialState);
    eStepCounter = 0;

    maxQChangeInLastEpisode = 0.;
    while(!env.isInTerminalState() && (eStepCounter < maxSteps || maxSteps == -1)){

        Action action = learningPolicy.action(curState.s());
        QValue curQ = this.getQ(curState, action);

        EnvironmentOutcome eo;
        if(!(action instanceof Option)){
            eo = env.executeAction(action);
        }
        else{
            eo = ((Option)action).control(env, this.gamma);
        }

        HashableState nextState = this.stateHash(eo.op);
        double maxQ = 0.;

        if(!eo.terminated){
            maxQ = this.getMaxQ(nextState);
        }

        //manage option specifics
        double r = eo.r;
        double discount = eo instanceof EnvironmentOptionOutcome ? ((EnvironmentOptionOutcome)eo).discount : this.gamma;
        int stepInc = eo instanceof EnvironmentOptionOutcome ? ((EnvironmentOptionOutcome)eo).numSteps() : 1;
        eStepCounter += stepInc;

        if(!(action instanceof Option) || !this.shouldDecomposeOptions){
            ea.transition(action, nextState.s(), r);
        }
        else{
            ea.appendAndMergeEpisodeAnalysis(((EnvironmentOptionOutcome)eo).episode);
        }

        double oldQ = curQ.q;

        //update Q-value
        curQ.q = curQ.q + this.learningRate.pollLearningRate(this.totalNumberOfSteps, curState.s(), action) * (r + (discount * maxQ) - curQ.q);

        double deltaQ = Math.abs(oldQ - curQ.q);
        if(deltaQ > maxQChangeInLastEpisode){
            maxQChangeInLastEpisode = deltaQ;
        }

        //move on polling environment for its current state in case it changed during processing
        curState = this.stateHash(env.currentObservation());
        this.totalNumberOfSteps++;
    }

    return ea;
}
Example 11
Source File: PotentialShapedRMax.java From burlap with Apache License 2.0 | 2 votes |
@Override
public Episode runLearningEpisode(Environment env, int maxSteps) {

    State initialState = env.currentObservation();

    this.modelPlanner.initializePlannerIn(initialState);

    Episode ea = new Episode(initialState);

    Policy policy = this.createUnmodeledFavoredPolicy();

    State curState = initialState;
    int steps = 0;
    while(!env.isInTerminalState() && (steps < maxSteps || maxSteps == -1)){

        Action ga = policy.action(curState);
        EnvironmentOutcome eo = env.executeAction(ga);
        ea.transition(ga, eo.op, eo.r);

        boolean modeledTerminal = this.model.terminal(eo.op);

        if(!this.model.transitionIsModeled(curState, ga)
                || (!KWIKModel.Helper.stateTransitionsModeled(model, this.getActionTypes(), eo.op) && !modeledTerminal)){
            this.model.updateModel(eo);
            if(this.model.transitionIsModeled(curState, ga) || (eo.terminated != modeledTerminal && modeledTerminal != this.model.terminal(eo.op))){
                this.modelPlanner.modelChanged(curState);
                policy = this.createUnmodeledFavoredPolicy();
            }
        }

        curState = env.currentObservation();
        steps++;
    }

    if(episodeHistory.size() >= numEpisodesToStore){
        episodeHistory.poll();
    }
    episodeHistory.offer(ea);

    return ea;
}