/**
 * This applet demonstrates a simple game. It isn't designed to be general or reusable.
<p>
 * Copyright (C) 2006  <A HREF="http://www.cs.ubc.ca/spider/poole/">David Poole</A>.
<p>
 * This program gives core of the simulation. The GUI is in <A HREF="SGameGUI.java">SGameGUI.java</A>.  The environment code is at <A HREF="SGameEnv.java">SGameEnv.java</A>. This controller is at <A HREF="SGameQController.java">SGameQController.java</A>.
<p>
 This program is free software; you can redistribute it and/or
 modify it under the terms of the GNU General Public License
 as published by the Free Software Foundation; either version 2
 of the License, or (at your option) any later version.
<p>
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
<p>
 You should have received a copy of the GNU General Public License
 along with this program; if not, write to the Free Software
 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.


 * @author David Poole  poole@cs.ubc.ca
 * @version 0.41 2007-09-09 */

public class TGameQController extends TGameController
{
    /** Number of x positions in the grid. */
    private static final int X_DIM = 2;
    /** Number of y positions in the grid. */
    private static final int Y_DIM = 3;
    /** Number of actions: 0 = up, 1 = right, 2 = down, 3 = careful up. */
    private static final int NUM_ACTIONS = 4;

    /**
     * Constructs a new Q-learning/SARSA controller with the given environment.
     *
     * @param environment the game environment this controller acts in
     */
    TGameQController(TGameEnv environment) {
        super(environment);
        title = "Q-learning controller"; // fixed typo: was "contoller"
    }

    /**
     * Q-values, indexed as qvalues[xpos][ypos][action].
     */
    double qvalues[][][] = new double[X_DIM][Y_DIM][NUM_ACTIONS];

    /**
     * Visit counts, indexed as visits[xpos][ypos][action].
     * Used to derive a harmonic learning rate when alpha is not fixed.
     */
    int visits[][][] = new int[X_DIM][Y_DIM][NUM_ACTIONS];

    /**
     * Returns the current Q-value of a state-action pair.
     * The GUI uses this to display values and to draw the arrows.
     *
     * @param xval   the x-coordinate of the state
     * @param yval   the y-coordinate of the state
     * @param action the action index
     * @return the Q-value of (xval, yval, action)
     */
    public double qvalue(int xval, int yval, int action)
    {
        return qvalues[xval][yval][action];
    }

    /**
     * Converts a boolean to an int (true -&gt; 1, false -&gt; 0).
     *
     * @param boo the boolean to convert
     * @return 1 if boo is true, otherwise 0
     */
    int toInt(boolean boo)
    {
        return boo ? 1 : 0;
    }

    /**
     * Resets all Q-values to a given initial value and clears the visit counts.
     *
     * @param initVal the initial value, given by a box in the GUI
     */
    public void doreset(double initVal)
    {
        for (int x = 0; x < X_DIM; x++)
            for (int y = 0; y < Y_DIM; y++)
                for (int a = 0; a < NUM_ACTIONS; a++)
                {
                    qvalues[x][y][a] = initVal;
                    visits[x][y][a] = 0;
                }
    }

    /** Previous x-position; remembered across steps for SARSA. */
    int oldX = 0;
    /** Previous y-position; remembered across steps for SARSA. */
    int oldY = 2;
    /** Previous action; remembered across steps for SARSA. */
    int prevAction = 0;
    /** Previous reward; remembered across steps for SARSA. */
    double prevReward = 0.0;

    /**
     * Does one step: carries out the action in the environment and updates
     * the Q-value estimates from the resulting experience.
     *
     * <p>In Q-learning mode the update target is the received reward plus the
     * discounted value of the best action in the resulting state.  In SARSA
     * mode the update happens one step late, using the previously received
     * reward and the Q-value of the action actually being taken now.
     *
     * <p>The actions are
     * <ul>
     * <li> 0 is up
     * <li> 1 is right
     * <li> 2 is down
     * <li> 3 is careful up
     * </ul>
     *
     * @param action the action that the agent does
     */
    public void dostep(int action)
    {
        if (!sarsa) { // Q-learning
            oldX = environment.currX;
            oldY = environment.currY;

            double reward = environment.dostep(action);

            int newX = environment.currX;
            int newY = environment.currY;

            // TD target: reward plus discounted value of the best next action.
            double newDatum = reward + discount * value(newX, newY);
            visits[oldX][oldY][action]++;
            if (!alphaFixed)
                alpha = 1.0 / visits[oldX][oldY][action]; // harmonic step size

            qvalues[oldX][oldY][action] =
                (1 - alpha) * qvalues[oldX][oldY][action] + alpha * newDatum;
            // More detailed tracing, if needed:
            // if (environment.tracing)
            //     System.out.println("\tqvalues[" + oldX + "][" + oldY + "][" + action + "]= "
            //         + (1 - alpha) + "*" + qvalues[oldX][oldY][action]
            //         + "+" + alpha + "*" + newDatum);
        } else { // SARSA
            int currX = environment.currX;
            int currY = environment.currY;
            // SARSA target: previous reward plus the discounted Q-value of the
            // action about to be taken from the current state.
            double newDatum = prevReward + discount * qvalues[currX][currY][action];
            visits[oldX][oldY][prevAction]++;
            if (!alphaFixed)
                alpha = 1.0 / visits[oldX][oldY][prevAction]; // harmonic step size

            qvalues[oldX][oldY][prevAction] =
                (1 - alpha) * qvalues[oldX][oldY][prevAction] + alpha * newDatum;
            // More detailed tracing, if needed:
            // if (environment.tracing)
            //     System.out.println("\tqvalues[" + oldX + "][" + oldY + "][" + prevAction + "]= "
            //         + (1 - alpha) + "*" + qvalues[oldX][oldY][prevAction]
            //         + "+" + alpha + "*" + newDatum);
            prevReward = environment.dostep(action);
            oldX = currX;
            oldY = currY;
            prevAction = action;
        }
    }

    /**
     * Determines the value of a state: the maximum, for all actions, of the
     * Q-value.
     *
     * @param xval the x-coordinate
     * @param yval the y-coordinate
     * @return the value of the (xval, yval) position
     */
    public double value(int xval, int yval)
    {
        double val = qvalues[xval][yval][0];
        for (int a = 1; a < NUM_ACTIONS; a++)
            val = Math.max(val, qvalues[xval][yval][a]);
        return val;
    }

    /**
     * Does count number of steps, each chosen greedily with probability
     * greedyProb and uniformly at random otherwise.  Ties among greedy
     * actions are broken by starting the argmax scan at a random action.
     *
     * @param count      the number of steps to do
     * @param greedyProb the probability that a step is chosen greedily
     */
    public void doSteps(int count, double greedyProb)
    {
        for (int i = 0; i < count; i++) {
            if (Math.random() < greedyProb) { // act greedily
                int dir = (int) (Math.random() * NUM_ACTIONS); // random start for tie-breaking
                int bestDir = dir;
                double bestVal = qvalues[environment.currX][environment.currY][dir];
                for (int k = 1; k < NUM_ACTIONS; k++) {
                    dir = (dir + 1) % NUM_ACTIONS;
                    double v = qvalues[environment.currX][environment.currY][dir];
                    if (v > bestVal) {
                        bestVal = v;
                        bestDir = dir;
                    }
                }
                dostep(bestDir);
            } else { // act randomly
                dostep((int) (Math.random() * NUM_ACTIONS));
            }
        }
    }
}
