/**
 * This applet demonstrates a simple game. It isn't designed to be general or reusable.
<p>
 * Copyright (C) 2006  <A HREF="http://www.cs.ubc.ca/spider/poole/">David Poole</A>.
<p>
 * This program gives the core of the simulation. The GUI is in <A HREF="SGameGUI.java">SGameGUI.java</A>.  The environment code is at <A HREF="SGameEnv.java">SGameEnv.java</A>. This controller is at <A HREF="SGameQController.java">SGameQController.java</A>.
<p>
 This program is free software; you can redistribute it and/or
 modify it under the terms of the GNU General Public License
 as published by the Free Software Foundation; either version 2
 of the License, or (at your option) any later version.
<p>
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
<p>
 You should have received a copy of the GNU General Public License
 along with this program; if not, write to the Free Software
 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.


 * @author David Poole  poole@cs.ubc.ca
 * @version 0.42 2011-04-19 */

public class SGameQController extends SGameController
{
    // Sizes of the state-variable domains (grid position, prize, damage).
    public static final int X_SIZE = 5;
    public static final int Y_SIZE = 5;
    public static final int PRIZE_SIZE = 5;
    public static final int DAMAGE_SIZE = 2;
    /** Total number of states = X_SIZE * Y_SIZE * PRIZE_SIZE * DAMAGE_SIZE. */
    public static final int STATE_SIZE = 250;
    /** Number of actions (0=up, 1=right, 2=down, 3=left). */
    public static final int ACTION_SIZE = 4;

    /**
     * Constructs a new Q-learning controller; the environment is set up by
     * the superclass constructor.
     */
    SGameQController() {
        super();
        title = "Q-learning controller";  // fixed typo: was "contoller"
    }

    /**
     * qvalues[state][action] gives the current Q-value estimate.
     */
    double[][] qvalues = new double[STATE_SIZE][ACTION_SIZE];

    /**
     * visits[state][action] gives the number of times the action has been
     * carried out in the state.
     */
    int[][] visits = new int[STATE_SIZE][ACTION_SIZE];

    /**
     * Returns the Q-value for a grid position and action, using the
     * environment's current prize and damage state.  The GUI uses this to
     * display values and arrows.
     *
     * @param xval   x grid coordinate
     * @param yval   y grid coordinate
     * @param action the action index (0..ACTION_SIZE-1)
     * @return the Q-value of the corresponding (state, action) pair
     */
    public double qvalue(int xval, int yval, int action)
    {
        return qvalues[state(xval, yval, environment.prize, environment.damaged)][action];
    }

    /**
     * Returns the visit count for a grid position and action, using the
     * environment's current prize and damage state.  The GUI uses this to
     * display counts.
     *
     * @param xval   x grid coordinate
     * @param yval   y grid coordinate
     * @param action the action index (0..ACTION_SIZE-1)
     * @return the number of times the action was done in that state
     */
    public int getCounts(int xval, int yval, int action)
    {
        return visits[state(xval, yval, environment.prize, environment.damaged)][action];
    }

    /**
     * Encodes the state variables into a single state index in
     * [0, STATE_SIZE).
     *
     * @param xval   x grid coordinate, in [0, X_SIZE)
     * @param yval   y grid coordinate, in [0, Y_SIZE)
     * @param prize  prize variable, in [0, PRIZE_SIZE)
     * @param damage whether the agent is damaged
     * @return the mixed-radix encoding of the four state variables
     */
    public int state(int xval, int yval, int prize, boolean damage)
    {
        return ((xval * Y_SIZE + yval) * PRIZE_SIZE + prize) * DAMAGE_SIZE + (damage ? 1 : 0);
    }

    /**
     * Resets all Q-values to the given initial value and all visit counts
     * to zero.
     *
     * @param initVal the initial value, given by a box in the GUI
     */
    public void doreset(double initVal)
    {
        for (int st = 0; st < STATE_SIZE; st++) {
            for (int a = 0; a < ACTION_SIZE; a++) {
                qvalues[st][a] = initVal;
                visits[st][a] = 0;
            }
        }
    }

    /**
     * Does one step: carries out the action in the environment and updates
     * the Q-value of the (previous state, action) pair from the observed
     * reward and the value of the resulting state.
     *
     * <p>The actions are
     * <ul>
     * <li> 0 is up
     * <li> 1 is right
     * <li> 2 is down
     * <li> 3 is left
     * </ul>
     *
     * @param action the action that the agent does
     */
    public void dostep(int action)
    {
        int oldState = state(environment.currX, environment.currY,
                             environment.prize, environment.damaged);

        double reward = environment.dostep(action);

        int newState = state(environment.currX, environment.currY,
                             environment.prize, environment.damaged);

        // Temporal-difference target: immediate reward plus discounted
        // value of the successor state.
        double newDatum = reward + discount * value(newState);
        visits[oldState][action]++;
        // When alpha is not fixed, use the 1/k schedule so each
        // (state, action) pair's estimate is the running average.
        if (!alphaFixed) {
            alpha = 1.0 / visits[oldState][action];
        }

        qvalues[oldState][action] =
            (1 - alpha) * qvalues[oldState][action] + alpha * newDatum;
    }

    /**
     * Determines the value of a state: the maximum, over all actions, of
     * the Q-value.
     *
     * @param state the state index
     * @return the value of the state
     */
    public double value(int state)
    {
        double val = qvalues[state][0];
        for (int a = 1; a < ACTION_SIZE; a++) {
            if (qvalues[state][a] > val) {
                val = qvalues[state][a];
            }
        }
        return val;
    }

    /**
     * Does count number of steps, each chosen greedily with probability
     * greedyProb and uniformly at random otherwise.
     *
     * @param count      the number of steps to do
     * @param greedyProb the probability that a step is chosen greedily
     */
    public void doSteps(int count, double greedyProb)
    {
        for (int i = 0; i < count; i++) {
            if (Math.random() < greedyProb) {
                // act greedily
                int currState = state(environment.currX, environment.currY,
                                      environment.prize, environment.damaged);
                dostep(greedyAction(currState));
            } else {
                // act randomly
                dostep((int) (Math.random() * ACTION_SIZE));
            }
        }
    }

    /**
     * Returns an action with maximal Q-value in the given state.  To avoid
     * always choosing the same action when several actions tie for the
     * maximum, the scan starts at a random action and returns the first
     * maximal action found.
     *
     * @param currState the state index to select an action for
     * @return an action index with maximal Q-value in currState
     */
    private int greedyAction(int currState)
    {
        int dir = (int) (Math.random() * ACTION_SIZE);
        double bestVal = qvalues[currState][dir];
        int bestDir = dir;
        for (int step = 1; step < ACTION_SIZE; step++) {
            dir = (dir + 1) % ACTION_SIZE;
            if (qvalues[currState][dir] > bestVal) {
                bestVal = qvalues[currState][dir];
                bestDir = dir;
            }
        }
        return bestDir;
    }
}
