/**
 * This applet demonstrates a simple game. It isn't designed to be general or reusable.
<p>
 * Copyright (C) 2006  <A HREF="http://www.cs.ubc.ca/spider/poole/">David Poole</A>.
<p>
 * This program gives core of the simulation. The GUI is in <A HREF="SGameGUI.java">SGameGUI.java</A>.  The environment code is at <A HREF="SGameEnv.java">SGameEnv.java</A>. This controller is at <A HREF="SGameQController.java">SGameQController.java</A>.
<p>
 This program is free software; you can redistribute it and/or
 modify it under the terms of the GNU General Public License
 as published by the Free Software Foundation; either version 2
 of the License, or (at your option) any later version.
<p>
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
<p>
 You should have received a copy of the GNU General Public License
 along with this program; if not, write to the Free Software
 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.


 * @author David Poole  poole@cs.ubc.ca
 * @version 0.41 2007-09-09 */

public class TGameQController extends TGameController
{
    /** Number of x positions in the grid. */
    private static final int X_DIM = 2;
    /** Number of y positions in the grid. */
    private static final int Y_DIM = 3;
    /** Number of actions: 0 = up, 1 = right, 2 = down, 3 = careful up. */
    private static final int NUM_ACTIONS = 4;

    /**
     * Constructs a new Q-learning/SARSA controller with the given environment.
     *
     * @param environment the game environment this controller acts in
     */
    TGameQController(TGameEnv environment) {
        super(environment);
        title = "Q-learning controller"; // fixed typo: was "contoller"
    }

    /**
     * Q-values, indexed as qvalues[xpos][ypos][action].
     */
    double qvalues[][][] = new double[X_DIM][Y_DIM][NUM_ACTIONS];

    /**
     * Visit counts, indexed as visits[xpos][ypos][action].
     * Used to derive a harmonic learning rate when alpha is not fixed.
     */
    int visits[][][] = new int[X_DIM][Y_DIM][NUM_ACTIONS];

    /**
     * Returns the current Q-value of a state-action pair.
     * The GUI uses this to display values and to draw the arrows.
     *
     * @param xval   the x-coordinate of the state
     * @param yval   the y-coordinate of the state
     * @param action the action index
     * @return the Q-value of (xval, yval, action)
     */
    public double qvalue(int xval, int yval, int action)
    {
        return qvalues[xval][yval][action];
    }

    /**
     * Converts a boolean to an int (true -&gt; 1, false -&gt; 0).
     *
     * @param boo the boolean to convert
     * @return 1 if boo is true, otherwise 0
     */
    int toInt(boolean boo)
    {
        return boo ? 1 : 0;
    }

    /**
     * Resets all Q-values to a given initial value and clears the visit counts.
     *
     * @param initVal the initial value, given by a box in the GUI
     */
    public void doreset(double initVal)
    {
        for (int x = 0; x < X_DIM; x++)
            for (int y = 0; y < Y_DIM; y++)
                for (int a = 0; a < NUM_ACTIONS; a++)
                {
                    qvalues[x][y][a] = initVal;
                    visits[x][y][a] = 0;
                }
    }

    /** Previous x-position; remembered across steps for SARSA. */
    int oldX = 0;
    /** Previous y-position; remembered across steps for SARSA. */
    int oldY = 2;
    /** Previous action; remembered across steps for SARSA. */
    int prevAction = 0;
    /** Previous reward; remembered across steps for SARSA. */
    double prevReward = 0.0;

    /**
     * Does one step: carries out the action in the environment and updates
     * the Q-value estimates from the resulting experience.
     *
     * <p>In Q-learning mode the update target is the received reward plus the
     * discounted value of the best action in the resulting state.  In SARSA
     * mode the update happens one step late, using the previously received
     * reward and the Q-value of the action actually being taken now.
     *
     * <p>The actions are
     * <ul>
     * <li> 0 is up
     * <li> 1 is right
     * <li> 2 is down
     * <li> 3 is careful up
     * </ul>
     *
     * @param action the action that the agent does
     */
    public void dostep(int action)
    {
        if (!sarsa) { // Q-learning
            oldX = environment.currX;
            oldY = environment.currY;

            double reward = environment.dostep(action);

            int newX = environment.currX;
            int newY = environment.currY;

            // TD target: reward plus discounted value of the best next action.
            double newDatum = reward + discount * value(newX, newY);
            visits[oldX][oldY][action]++;
            if (!alphaFixed)
                alpha = 1.0 / visits[oldX][oldY][action]; // harmonic step size

            qvalues[oldX][oldY][action] =
                (1 - alpha) * qvalues[oldX][oldY][action] + alpha * newDatum;
            // More detailed tracing, if needed:
            // if (environment.tracing)
            //     System.out.println("\tqvalues[" + oldX + "][" + oldY + "][" + action + "]= "
            //         + (1 - alpha) + "*" + qvalues[oldX][oldY][action]
            //         + "+" + alpha + "*" + newDatum);
        } else { // SARSA
            int currX = environment.currX;
            int currY = environment.currY;
            // SARSA target: previous reward plus the discounted Q-value of the
            // action about to be taken from the current state.
            double newDatum = prevReward + discount * qvalues[currX][currY][action];
            visits[oldX][oldY][prevAction]++;
            if (!alphaFixed)
                alpha = 1.0 / visits[oldX][oldY][prevAction]; // harmonic step size

            qvalues[oldX][oldY][prevAction] =
                (1 - alpha) * qvalues[oldX][oldY][prevAction] + alpha * newDatum;
            // More detailed tracing, if needed:
            // if (environment.tracing)
            //     System.out.println("\tqvalues[" + oldX + "][" + oldY + "][" + prevAction + "]= "
            //         + (1 - alpha) + "*" + qvalues[oldX][oldY][prevAction]
            //         + "+" + alpha + "*" + newDatum);
            prevReward = environment.dostep(action);
            oldX = currX;
            oldY = currY;
            prevAction = action;
        }
    }

    /**
     * Determines the value of a state: the maximum, for all actions, of the
     * Q-value.
     *
     * @param xval the x-coordinate
     * @param yval the y-coordinate
     * @return the value of the (xval, yval) position
     */
    public double value(int xval, int yval)
    {
        double val = qvalues[xval][yval][0];
        for (int a = 1; a < NUM_ACTIONS; a++)
            val = Math.max(val, qvalues[xval][yval][a]);
        return val;
    }

    /**
     * Does count number of steps, each chosen greedily with probability
     * greedyProb and uniformly at random otherwise.  Ties among greedy
     * actions are broken by starting the argmax scan at a random action.
     *
     * @param count      the number of steps to do
     * @param greedyProb the probability that a step is chosen greedily
     */
    public void doSteps(int count, double greedyProb)
    {
        for (int i = 0; i < count; i++) {
            if (Math.random() < greedyProb) { // act greedily
                int dir = (int) (Math.random() * NUM_ACTIONS); // random start for tie-breaking
                int bestDir = dir;
                double bestVal = qvalues[environment.currX][environment.currY][dir];
                for (int k = 1; k < NUM_ACTIONS; k++) {
                    dir = (dir + 1) % NUM_ACTIONS;
                    double v = qvalues[environment.currX][environment.currY][dir];
                    if (v > bestVal) {
                        bestVal = v;
                        bestDir = dir;
                    }
                }
                dostep(bestDir);
            } else { // act randomly
                dostep((int) (Math.random() * NUM_ACTIONS));
            }
        }
    }
}
