burlap.behavior.singleagent.options.EnvironmentOptionOutcome Java Examples
The following examples show how to use
burlap.behavior.singleagent.options.EnvironmentOptionOutcome.
Each example is drawn from an open-source project; its source file and license are noted above the code.
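Before the individual examples, here is a minimal sketch of the pattern they all share: executing an action against an Environment and, when that action is an Option, reading the extra information an EnvironmentOptionOutcome carries (numSteps(), discount, and episode) on top of the usual EnvironmentOutcome fields op, r, and terminated. The class name OptionOutcomeSketch, the method executeAndCountSteps, and the gamma parameter are hypothetical names for illustration, and the import paths assume the BURLAP 3 package layout used by the examples below.

    import burlap.behavior.singleagent.options.EnvironmentOptionOutcome;
    import burlap.behavior.singleagent.options.Option;
    import burlap.mdp.core.action.Action;
    import burlap.mdp.singleagent.environment.Environment;
    import burlap.mdp.singleagent.environment.EnvironmentOutcome;

    public class OptionOutcomeSketch {

        //Hypothetical helper: execute one action (primitive or option) and report how
        //many primitive environment steps it consumed.
        public static int executeAndCountSteps(Environment env, Action a, double gamma){
            EnvironmentOutcome eo = a instanceof Option
                    ? ((Option)a).control(env, gamma)   //run the option to termination in env
                    : env.executeAction(a);             //primitive actions go through the environment directly

            if(eo instanceof EnvironmentOptionOutcome){
                EnvironmentOptionOutcome oo = (EnvironmentOptionOutcome)eo;
                //oo.r is the cumulative discounted reward collected while the option ran,
                //oo.discount is the discount to apply to the value of the resulting state oo.op,
                //and oo.episode records every primitive step the option executed
                return oo.numSteps();
            }
            return 1; //a primitive action always counts as a single step
        }
    }

The examples below show this same idiom inside planners (A*, Sparse Sampling, UCT) and learners (Q-learning, approximate Q-learning).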
Example #1
Source File: DynamicWeightedAStar.java, from burlap (Apache License 2.0)
public double computeF(PrioritizedSearchNode parentNode, Action generatingAction, HashableState successorState, EnvironmentOutcome eo) {
    double cumR = 0.;
    int d = 0;
    if(parentNode != null){
        double pCumR = cumulatedRewardMap.get(parentNode.s);
        cumR = pCumR + eo.r;

        int pD = depthMap.get(parentNode.s);
        if(!(generatingAction instanceof Option)){
            d = pD + 1;
        }
        else{
            d = pD + ((EnvironmentOptionOutcome)eo).numSteps();
        }
    }

    double H = heuristic.h(successorState.s());
    lastComputedCumR = cumR;
    lastComputedDepth = d;

    double weightedE = this.epsilon * this.epsilonWeight(d);
    double F = cumR + ((1. + weightedE)*H);

    return F;
}
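Because an option can execute many primitive actions before terminating, the depth d is advanced by ((EnvironmentOptionOutcome)eo).numSteps() rather than by 1, so the depth-dependent weight this.epsilonWeight(d) is computed over primitive steps rather than decision points; eo.r holds the option's cumulative reward and is added to the parent's accumulated reward just like a primitive reward.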
Example #2
Source File: SparseSampling.java, from burlap (Apache License 2.0)
/**
 * Estimates the Q-value using sampling from the transition dynamics. This is the standard Sparse Sampling procedure.
 * @param ga the action for which the Q-value estimate is to be returned
 * @return the Q-value estimate
 */
protected double sampledQEstimate(Action ga){

    double sum = 0.;

    //generate C samples
    int c = SparseSampling.this.getCAtHeight(this.height);
    for(int i = 0; i < c; i++){

        //execute
        EnvironmentOutcome eo = model.sample(sh.s(), ga);
        State ns = eo.op;

        //manage option stepsize modifications
        int k = 1;
        if(ga instanceof Option){
            k = ((EnvironmentOptionOutcome)eo).numSteps();
        }

        //get reward; our rf will automatically do cumulative discounted if it's an option
        double r = eo.r;

        StateNode nsn = SparseSampling.this.getStateNode(ns, this.height-k);

        sum += r + Math.pow(SparseSampling.this.gamma, k)*nsn.estimateV();
    }
    sum /= (double)c;

    return sum;
}
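Note that eo.r already contains the cumulative discounted reward when the sampled action is an option (as the comment in the code points out), so the backup only needs to discount the successor node's value by gamma raised to k = numSteps() and to look the successor up k levels lower in the tree (this.height - k).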
Example #3
Source File: OptionsExample.java, from burlap_examples (MIT License)
public static Episode optionExecuteResult(SADomain domain, Option o, State s){
    SimulatedEnvironment env = new SimulatedEnvironment(domain, s);
    EnvironmentOptionOutcome eo = o.control(env, 0.99);
    return eo.episode;
}
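Here o.control(env, 0.99) executes the option to termination inside the SimulatedEnvironment, with 0.99 used as the discount factor while accumulating the option's reward. The returned EnvironmentOptionOutcome exposes the full sequence of primitive steps the option took through its episode field, which is what this helper returns.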
Example #4
Source File: PolicyUtils.java, from burlap (Apache License 2.0)
/**
 * Follows this policy for one time step in the provided {@link burlap.mdp.singleagent.environment.Environment} and
 * records the interaction in the provided {@link Episode} object. If the policy
 * selects an {@link burlap.behavior.singleagent.options.Option}, then how the option's interaction in the environment
 * is recorded depends on the {@link #rolloutsDecomposeOptions} flag.
 * If {@link #rolloutsDecomposeOptions} is false, then the option is recorded as a single action. If it is true, then
 * the individual primitive actions selected by the option are recorded.
 * @param p the {@link Policy}
 * @param env The {@link burlap.mdp.singleagent.environment.Environment} in which this policy should be followed.
 * @param ea The {@link Episode} object to which the action selection will be recorded.
 */
protected static void followAndRecordPolicy(Policy p, Environment env, Episode ea){

    //follow policy
    Action a = p.action(env.currentObservation());
    if(a == null){
        throw new PolicyUndefinedException();
    }

    EnvironmentOutcome eo = env.executeAction(a);

    if(a instanceof Option && rolloutsDecomposeOptions){
        ea.appendAndMergeEpisodeAnalysis(((EnvironmentOptionOutcome)eo).episode);
    }
    else{
        ea.transition(a, eo.op, eo.r);
    }
}
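If rolloutsDecomposeOptions is true, the option's own recorded episode, ((EnvironmentOptionOutcome)eo).episode, is merged into the rollout so that only primitive actions appear in ea; otherwise the option is logged as a single transition with its cumulative reward eo.r.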
Example #5
Source File: ApproximateQLearning.java, from burlap (Apache License 2.0)
@Override
public Episode runLearningEpisode(Environment env, int maxSteps) {

    State initialState = env.currentObservation();
    Episode e = new Episode(initialState);

    int eStepCounter = 0;
    while(!env.isInTerminalState() && (eStepCounter < maxSteps || maxSteps == -1)){

        //check state
        State curState = stateMapping.mapState(env.currentObservation());

        //select action
        Action a = this.learningPolicy.action(curState);

        //take action
        EnvironmentOutcome eo = env.executeAction(a);

        //save outcome in memory
        this.memory.addExperience(eo);

        //record transition and manage option case
        int stepInc = eo instanceof EnvironmentOptionOutcome ? ((EnvironmentOptionOutcome)eo).numSteps() : 1;
        eStepCounter += stepInc;
        this.totalSteps += stepInc;
        e.transition(a, eo.op, eo.r);

        //perform learning
        List<EnvironmentOutcome> samples = this.memory.sampleExperiences(this.numReplay);
        this.updateQFunction(samples);

        //update stale function
        this.stepsSinceStale++;
        if(this.stepsSinceStale >= this.staleDuration){
            this.updateStaleFunction();
        }
    }

    this.totalEpisodes++;
    return e;
}
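The step counters eStepCounter and totalSteps are advanced by numSteps() whenever the outcome is an EnvironmentOptionOutcome, so the maxSteps budget is measured in primitive environment steps even when the learning policy selects options.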
Example #6
Source File: UCT.java, from burlap (Apache License 2.0)
/**
 * Performs a rollout in the UCT tree from the given node, keeping track of how many new nodes can be added to the tree.
 * @param node the node from which to rollout
 * @param depth the depth of the node
 * @param childrenLeftToAdd the number of new subsequent nodes that can be connected to the tree
 * @return the sample return from rolling out from this node
 */
public double treeRollOut(UCTStateNode node, int depth, int childrenLeftToAdd){

    numVisits++;

    if(depth == maxHorizon){
        return 0.;
    }

    if(model.terminal(node.state.s())){
        if(goalCondition != null && goalCondition.satisfies(node.state.s())){
            foundGoal = true;
            foundGoalOnRollout = true;
        }
        DPrint.cl(debugCode, numRollOutsFromRoot + " Hit terminal at depth: " + depth);
        return 0.;
    }

    UCTActionNode anode = this.selectActionNode(node);

    if(anode == null){
        //no actions can be performed in this state
        return 0.;
    }

    //sample the action
    EnvironmentOutcome eo = model.sample(node.state.s(), anode.action);
    HashableState shprime = this.stateHash(eo.op);
    double r = eo.r;

    int depthChange = 1;
    if(anode.action instanceof Option){
        depthChange = ((EnvironmentOptionOutcome)eo).numSteps();
    }

    UCTStateNode snprime = this.queryTreeIndex(shprime, depth+depthChange);

    double sampledReturn;

    boolean shouldConnectNode = false;
    double futureReturn;
    if(snprime != null){

        //then this state already exists in the tree
        if(!anode.referencesSuccessor(snprime)){
            //then this successor has not been generated by this state-action pair before and should be indexed
            anode.addSuccessor(snprime);
        }

        futureReturn = this.treeRollOut(snprime, depth + depthChange, childrenLeftToAdd);
        sampledReturn = r + Math.pow(gamma, depthChange) * futureReturn;
    }
    else{

        //this state is not in the tree at this depth so create it
        snprime = stateNodeConstructor.generate(shprime, depth+1, actionTypes, actionNodeConstructor);

        //store it in the tree depending on how many new nodes have already been stored in this roll out
        if(childrenLeftToAdd > 0){
            shouldConnectNode = true;
        }

        //and do an exploratory sample from it
        futureReturn = this.treeRollOut(snprime, depth + depthChange, childrenLeftToAdd-1);
        sampledReturn = r + gamma * futureReturn;
    }

    node.n++;
    anode.update(sampledReturn);

    if(shouldConnectNode || foundGoalOnRollout){
        this.addNodeToIndexTree(snprime);
        anode.addSuccessor(snprime);
        uniqueStatesInTree.add(snprime.state);
    }

    return sampledReturn;
}
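Within the rollout, an option changes the tree depth by numSteps() instead of 1, and when the successor node already exists in the tree the future return is discounted by Math.pow(gamma, depthChange) to match; eo.r itself is the option's cumulative reward over those primitive steps.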
Example #7
Source File: DeepQTester.java, from burlap_caffe (Apache License 2.0)
@Override
public Episode runTestEpisode(Environment env, int maxSteps) {

    State initialState = env.currentObservation();
    Episode e = new Episode(initialState);

    int eStepCounter = 0;
    while(!env.isInTerminalState() && (eStepCounter < maxSteps || maxSteps == -1)){

        //check state
        State curState = stateMapping.mapState(env.currentObservation());

        //select action
        Action a = this.policy.action(curState);

        //take action
        EnvironmentOutcome eo = env.executeAction(a);

        //save outcome in memory
        this.memory.addExperience(eo);

        //record transition and manage option case
        int stepInc = eo instanceof EnvironmentOptionOutcome ? ((EnvironmentOptionOutcome)eo).numSteps() : 1;
        eStepCounter += stepInc;
        e.transition(a, eo.op, eo.r);
    }

    return e;
}
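This mirrors the learning loop of Example #5 but only evaluates the fixed policy: outcomes are still stored in the replay memory and the step counter is still advanced by numSteps() for option outcomes, but no Q-function updates are performed.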
Example #8
Source File: QLearning.java, from burlap (Apache License 2.0)
@Override
public Episode runLearningEpisode(Environment env, int maxSteps) {

    State initialState = env.currentObservation();
    Episode ea = new Episode(initialState);
    HashableState curState = this.stateHash(initialState);
    eStepCounter = 0;

    maxQChangeInLastEpisode = 0.;
    while(!env.isInTerminalState() && (eStepCounter < maxSteps || maxSteps == -1)){

        Action action = learningPolicy.action(curState.s());
        QValue curQ = this.getQ(curState, action);

        EnvironmentOutcome eo;
        if(!(action instanceof Option)){
            eo = env.executeAction(action);
        }
        else{
            eo = ((Option)action).control(env, this.gamma);
        }

        HashableState nextState = this.stateHash(eo.op);
        double maxQ = 0.;

        if(!eo.terminated){
            maxQ = this.getMaxQ(nextState);
        }

        //manage option specifics
        double r = eo.r;
        double discount = eo instanceof EnvironmentOptionOutcome ? ((EnvironmentOptionOutcome)eo).discount : this.gamma;
        int stepInc = eo instanceof EnvironmentOptionOutcome ? ((EnvironmentOptionOutcome)eo).numSteps() : 1;
        eStepCounter += stepInc;

        if(!(action instanceof Option) || !this.shouldDecomposeOptions){
            ea.transition(action, nextState.s(), r);
        }
        else{
            ea.appendAndMergeEpisodeAnalysis(((EnvironmentOptionOutcome)eo).episode);
        }

        double oldQ = curQ.q;

        //update Q-value
        curQ.q = curQ.q + this.learningRate.pollLearningRate(this.totalNumberOfSteps, curState.s(), action) * (r + (discount * maxQ) - curQ.q);

        double deltaQ = Math.abs(oldQ - curQ.q);
        if(deltaQ > maxQChangeInLastEpisode){
            maxQChangeInLastEpisode = deltaQ;
        }

        //move on polling environment for its current state in case it changed during processing
        curState = this.stateHash(env.currentObservation());
        this.totalNumberOfSteps++;
    }

    return ea;
}
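Three option-specific adjustments appear here: the option is executed with Option.control(env, this.gamma) rather than env.executeAction; the Q-learning backup uses the outcome's discount field in place of gamma, which accounts for the option's multi-step duration; and, depending on shouldDecomposeOptions, the episode either records the option as a single transition or merges in its decomposed episode of primitive steps.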