burlap.behavior.singleagent.options.EnvironmentOptionOutcome Java Examples
The following examples show how to use
burlap.behavior.singleagent.options.EnvironmentOptionOutcome.
Each example is drawn from an open-source project; its source file and license are noted above the code.
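Before the individual examples, here is a minimal sketch of the pattern they all share: executing an action against an Environment and, when that action is an Option, reading the extra information an EnvironmentOptionOutcome carries (numSteps(), discount, and episode) on top of the usual EnvironmentOutcome fields op, r, and terminated. The class name OptionOutcomeSketch, the method executeAndCountSteps, and the gamma parameter are hypothetical names for illustration, and the import paths assume the BURLAP 3 package layout used by the examples below.

    import burlap.behavior.singleagent.options.EnvironmentOptionOutcome;
    import burlap.behavior.singleagent.options.Option;
    import burlap.mdp.core.action.Action;
    import burlap.mdp.singleagent.environment.Environment;
    import burlap.mdp.singleagent.environment.EnvironmentOutcome;

    public class OptionOutcomeSketch {

        //Hypothetical helper: execute one action (primitive or option) and report how
        //many primitive environment steps it consumed.
        public static int executeAndCountSteps(Environment env, Action a, double gamma){
            EnvironmentOutcome eo = a instanceof Option
                    ? ((Option)a).control(env, gamma)   //run the option to termination in env
                    : env.executeAction(a);             //primitive actions go through the environment directly

            if(eo instanceof EnvironmentOptionOutcome){
                EnvironmentOptionOutcome oo = (EnvironmentOptionOutcome)eo;
                //oo.r is the cumulative discounted reward collected while the option ran,
                //oo.discount is the discount to apply to the value of the resulting state oo.op,
                //and oo.episode records every primitive step the option executed
                return oo.numSteps();
            }
            return 1; //a primitive action always counts as a single step
        }
    }

The examples below show this same idiom inside planners (A*, Sparse Sampling, UCT) and learners (Q-learning, approximate Q-learning).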
Example #1
Source File: DynamicWeightedAStar.java, from burlap (Apache License 2.0)
public double computeF(PrioritizedSearchNode parentNode, Action generatingAction, HashableState successorState, EnvironmentOutcome eo) {
    double cumR = 0.;
    int d = 0;
    if(parentNode != null){
        double pCumR = cumulatedRewardMap.get(parentNode.s);
        cumR = pCumR + eo.r;

        int pD = depthMap.get(parentNode.s);
        if(!(generatingAction instanceof Option)){
            d = pD + 1;
        }
        else{
            d = pD + ((EnvironmentOptionOutcome)eo).numSteps();
        }
    }

    double H = heuristic.h(successorState.s());
    lastComputedCumR = cumR;
    lastComputedDepth = d;

    double weightedE = this.epsilon * this.epsilonWeight(d);
    double F = cumR + ((1. + weightedE)*H);

    return F;
}
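Because an option can execute many primitive actions before terminating, the depth d is advanced by ((EnvironmentOptionOutcome)eo).numSteps() rather than by 1, so the depth-dependent weight this.epsilonWeight(d) is computed over primitive steps rather than decision points; eo.r holds the option's cumulative reward and is added to the parent's accumulated reward just like a primitive reward.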
Example #2
Source File: SparseSampling.java, from burlap (Apache License 2.0)
/**
 * Estimates the Q-value using sampling from the transition dynamics. This is the standard Sparse Sampling procedure.
 * @param ga the action for which the Q-value estimate is to be returned
 * @return the Q-value estimate
 */
protected double sampledQEstimate(Action ga){

    double sum = 0.;

    //generate C samples
    int c = SparseSampling.this.getCAtHeight(this.height);
    for(int i = 0; i < c; i++){

        //execute
        EnvironmentOutcome eo = model.sample(sh.s(), ga);
        State ns = eo.op;

        //manage option stepsize modifications
        int k = 1;
        if(ga instanceof Option){
            k = ((EnvironmentOptionOutcome)eo).numSteps();
        }

        //get reward; our rf will automatically do cumulative discounted if it's an option
        double r = eo.r;

        StateNode nsn = SparseSampling.this.getStateNode(ns, this.height-k);

        sum += r + Math.pow(SparseSampling.this.gamma, k)*nsn.estimateV();
    }
    sum /= (double)c;

    return sum;
}
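Note that eo.r already contains the cumulative discounted reward when the sampled action is an option (as the comment in the code points out), so the backup only needs to discount the successor node's value by gamma raised to k = numSteps() and to look the successor up k levels lower in the tree (this.height - k).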
Example #3
Source File: OptionsExample.java, from burlap_examples (MIT License)
public static Episode optionExecuteResult(SADomain domain, Option o, State s){
    SimulatedEnvironment env = new SimulatedEnvironment(domain, s);
    EnvironmentOptionOutcome eo = o.control(env, 0.99);
    return eo.episode;
}
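Here o.control(env, 0.99) executes the option to termination inside the SimulatedEnvironment, with 0.99 used as the discount factor while accumulating the option's reward. The returned EnvironmentOptionOutcome exposes the full sequence of primitive steps the option took through its episode field, which is what this helper returns.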
Example #4
Source File: PolicyUtils.java, from burlap (Apache License 2.0)
/**
 * Follows this policy for one time step in the provided {@link burlap.mdp.singleagent.environment.Environment} and
 * records the interaction in the provided {@link Episode} object. If the policy
 * selects an {@link burlap.behavior.singleagent.options.Option}, then how the option's interaction in the environment
 * is recorded depends on the {@link #rolloutsDecomposeOptions} flag.
 * If {@link #rolloutsDecomposeOptions} is false, then the option is recorded as a single action. If it is true, then
 * the individual primitive actions selected by the option are recorded.
 * @param p the {@link Policy}
 * @param env The {@link burlap.mdp.singleagent.environment.Environment} in which this policy should be followed.
 * @param ea The {@link Episode} object to which the action selection will be recorded.
 */
protected static void followAndRecordPolicy(Policy p, Environment env, Episode ea){

    //follow policy
    Action a = p.action(env.currentObservation());
    if(a == null){
        throw new PolicyUndefinedException();
    }

    EnvironmentOutcome eo = env.executeAction(a);

    if(a instanceof Option && rolloutsDecomposeOptions){
        ea.appendAndMergeEpisodeAnalysis(((EnvironmentOptionOutcome)eo).episode);
    }
    else{
        ea.transition(a, eo.op, eo.r);
    }
}
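If rolloutsDecomposeOptions is true, the option's own recorded episode, ((EnvironmentOptionOutcome)eo).episode, is merged into the rollout so that only primitive actions appear in ea; otherwise the option is logged as a single transition with its cumulative reward eo.r.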
Example #5
Source File: ApproximateQLearning.java, from burlap (Apache License 2.0)
@Override
public Episode runLearningEpisode(Environment env, int maxSteps) {

    State initialState = env.currentObservation();
    Episode e = new Episode(initialState);

    int eStepCounter = 0;
    while(!env.isInTerminalState() && (eStepCounter < maxSteps || maxSteps == -1)){

        //check state
        State curState = stateMapping.mapState(env.currentObservation());

        //select action
        Action a = this.learningPolicy.action(curState);

        //take action
        EnvironmentOutcome eo = env.executeAction(a);

        //save outcome in memory
        this.memory.addExperience(eo);

        //record transition and manage option case
        int stepInc = eo instanceof EnvironmentOptionOutcome ? ((EnvironmentOptionOutcome)eo).numSteps() : 1;
        eStepCounter += stepInc;
        this.totalSteps += stepInc;
        e.transition(a, eo.op, eo.r);

        //perform learning
        List<EnvironmentOutcome> samples = this.memory.sampleExperiences(this.numReplay);
        this.updateQFunction(samples);

        //update stale function
        this.stepsSinceStale++;
        if(this.stepsSinceStale >= this.staleDuration){
            this.updateStaleFunction();
        }
    }

    this.totalEpisodes++;
    return e;
}
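The step counters eStepCounter and totalSteps are advanced by numSteps() whenever the outcome is an EnvironmentOptionOutcome, so the maxSteps budget is measured in primitive environment steps even when the learning policy selects options.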
Example #6
Source File: UCT.java, from burlap (Apache License 2.0)
/**
 * Performs a rollout in the UCT tree from the given node, keeping track of how many new nodes can be added to the tree.
 * @param node the node from which to rollout
 * @param depth the depth of the node
 * @param childrenLeftToAdd the number of new subsequent nodes that can be connected to the tree
 * @return the sample return from rolling out from this node
 */
public double treeRollOut(UCTStateNode node, int depth, int childrenLeftToAdd){

    numVisits++;

    if(depth == maxHorizon){
        return 0.;
    }

    if(model.terminal(node.state.s())){
        if(goalCondition != null && goalCondition.satisfies(node.state.s())){
            foundGoal = true;
            foundGoalOnRollout = true;
        }
        DPrint.cl(debugCode, numRollOutsFromRoot + " Hit terminal at depth: " + depth);
        return 0.;
    }

    UCTActionNode anode = this.selectActionNode(node);

    if(anode == null){
        //no actions can be performed in this state
        return 0.;
    }

    //sample the action
    EnvironmentOutcome eo = model.sample(node.state.s(), anode.action);
    HashableState shprime = this.stateHash(eo.op);
    double r = eo.r;

    int depthChange = 1;
    if(anode.action instanceof Option){
        depthChange = ((EnvironmentOptionOutcome)eo).numSteps();
    }

    UCTStateNode snprime = this.queryTreeIndex(shprime, depth+depthChange);

    double sampledReturn;

    boolean shouldConnectNode = false;
    double futureReturn;
    if(snprime != null){

        //then this state already exists in the tree
        if(!anode.referencesSuccessor(snprime)){
            //then this successor has not been generated by this state-action pair before and should be indexed
            anode.addSuccessor(snprime);
        }

        futureReturn = this.treeRollOut(snprime, depth + depthChange, childrenLeftToAdd);
        sampledReturn = r + Math.pow(gamma, depthChange) * futureReturn;
    }
    else{

        //this state is not in the tree at this depth so create it
        snprime = stateNodeConstructor.generate(shprime, depth+1, actionTypes, actionNodeConstructor);

        //store it in the tree depending on how many new nodes have already been stored in this roll out
        if(childrenLeftToAdd > 0){
            shouldConnectNode = true;
        }

        //and do an exploratory sample from it
        futureReturn = this.treeRollOut(snprime, depth + depthChange, childrenLeftToAdd-1);
        sampledReturn = r + gamma * futureReturn;
    }

    node.n++;
    anode.update(sampledReturn);

    if(shouldConnectNode || foundGoalOnRollout){
        this.addNodeToIndexTree(snprime);
        anode.addSuccessor(snprime);
        uniqueStatesInTree.add(snprime.state);
    }

    return sampledReturn;
}
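Within the rollout, an option changes the tree depth by numSteps() instead of 1, and when the successor node already exists in the tree the future return is discounted by Math.pow(gamma, depthChange) to match; eo.r itself is the option's cumulative reward over those primitive steps.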
Example #7
Source File: DeepQTester.java, from burlap_caffe (Apache License 2.0)
@Override
public Episode runTestEpisode(Environment env, int maxSteps) {

    State initialState = env.currentObservation();
    Episode e = new Episode(initialState);

    int eStepCounter = 0;
    while(!env.isInTerminalState() && (eStepCounter < maxSteps || maxSteps == -1)){

        //check state
        State curState = stateMapping.mapState(env.currentObservation());

        //select action
        Action a = this.policy.action(curState);

        //take action
        EnvironmentOutcome eo = env.executeAction(a);

        //save outcome in memory
        this.memory.addExperience(eo);

        //record transition and manage option case
        int stepInc = eo instanceof EnvironmentOptionOutcome ? ((EnvironmentOptionOutcome)eo).numSteps() : 1;
        eStepCounter += stepInc;
        e.transition(a, eo.op, eo.r);
    }

    return e;
}
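This mirrors the learning loop of Example #5 but only evaluates the fixed policy: outcomes are still stored in the replay memory and the step counter is still advanced by numSteps() for option outcomes, but no Q-function updates are performed.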
Example #8
Source File: QLearning.java, from burlap (Apache License 2.0)
@Override
public Episode runLearningEpisode(Environment env, int maxSteps) {

    State initialState = env.currentObservation();
    Episode ea = new Episode(initialState);
    HashableState curState = this.stateHash(initialState);
    eStepCounter = 0;

    maxQChangeInLastEpisode = 0.;
    while(!env.isInTerminalState() && (eStepCounter < maxSteps || maxSteps == -1)){

        Action action = learningPolicy.action(curState.s());
        QValue curQ = this.getQ(curState, action);

        EnvironmentOutcome eo;
        if(!(action instanceof Option)){
            eo = env.executeAction(action);
        }
        else{
            eo = ((Option)action).control(env, this.gamma);
        }

        HashableState nextState = this.stateHash(eo.op);
        double maxQ = 0.;

        if(!eo.terminated){
            maxQ = this.getMaxQ(nextState);
        }

        //manage option specifics
        double r = eo.r;
        double discount = eo instanceof EnvironmentOptionOutcome ? ((EnvironmentOptionOutcome)eo).discount : this.gamma;
        int stepInc = eo instanceof EnvironmentOptionOutcome ? ((EnvironmentOptionOutcome)eo).numSteps() : 1;
        eStepCounter += stepInc;

        if(!(action instanceof Option) || !this.shouldDecomposeOptions){
            ea.transition(action, nextState.s(), r);
        }
        else{
            ea.appendAndMergeEpisodeAnalysis(((EnvironmentOptionOutcome)eo).episode);
        }

        double oldQ = curQ.q;

        //update Q-value
        curQ.q = curQ.q + this.learningRate.pollLearningRate(this.totalNumberOfSteps, curState.s(), action) * (r + (discount * maxQ) - curQ.q);

        double deltaQ = Math.abs(oldQ - curQ.q);
        if(deltaQ > maxQChangeInLastEpisode){
            maxQChangeInLastEpisode = deltaQ;
        }

        //move on polling environment for its current state in case it changed during processing
        curState = this.stateHash(env.currentObservation());
        this.totalNumberOfSteps++;
    }

    return ea;
}
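Three option-specific adjustments appear here: the option is executed with Option.control(env, this.gamma) rather than env.executeAction; the Q-learning backup uses the outcome's discount field in place of gamma, which accounts for the option's multi-step duration; and, depending on shouldDecomposeOptions, the episode either records the option as a single transition or merges in its decomposed episode of primitive steps.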