/**
 * This applet demonstrates a simple game. It isn't designed to be general or reusable.
<p>
 * Copyright (C) 2006  <A HREF="http://www.cs.ubc.ca/spider/poole/">David Poole</A>.
<p>
 * This program gives the core of the simulation. The GUI is in <A HREF="SGameGUI.java">SGameGUI.java</A>. The environment code is in <A HREF="SGameEnv.java">SGameEnv.java</A>. A Q-learning controller is in <A HREF="SGameQController.java">SGameQController.java</A>.
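<p>
 * This controller is model-based: it records transition counts and average rewards from experience, and computes the Q-values from this learned model by repeated value-iteration backups (see dostep).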
<p>
 This program is free software; you can redistribute it and/or
 modify it under the terms of the GNU General Public License
 as published by the Free Software Foundation; either version 2
 of the License, or (at your option) any later version.
<p>
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
<p>
 You should have received a copy of the GNU General Public License
 along with this program; if not, write to the Free Software
 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.


 * @author David Poole  poole@cs.ubc.ca
 * @version 0.42 2011-04-19 */

public class SGameModelController extends SGameController
{
    // SIZES of the domains
    public final int X_SIZE=5;
    public final int Y_SIZE=5;
    public final int PRIZE_SIZE=5;
    public final int DAMAGE_SIZE=2;
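    // the number of states: X_SIZE * Y_SIZE * PRIZE_SIZE * DAMAGE_SIZE = 5*5*5*2 = 250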
    public final int STATE_SIZE=250;
    public final int ACTION_SIZE=4;


    /**
     * Construct a new controller attached to the given GUI.
     *
     * @param gui  the GUI that displays this controller
     */
    SGameModelController(SGameGUI gui) {
	super();
	gui.alphaText = "updates/step";
	alpha = 10;
	title = "model-based controller";
    }
    
    /**
       qvalues[state][action] gives the Q-value for doing action in state
     */
    double[][] qvalues = new double[STATE_SIZE][ACTION_SIZE];
    /**
       transitions[s][a][s2] gives the number of times action a has been carried out in state s resulting in state s2
     */
    int[][][] transitions = new int[STATE_SIZE][ACTION_SIZE][STATE_SIZE];
    /**
       count_actions_in_state[s][a] gives the number of times action a has been carried out in state s.

       Note that count_actions_in_state could be computed from transitions, but is cached for efficiency.
     */
    int[][] count_actions_in_state = new int[STATE_SIZE][ACTION_SIZE];
    /**
       rewards[s][a] gives the average observed reward for doing action a in state s (an empirical estimate of the expected reward)
     */
    double[][] rewards = new double[STATE_SIZE][ACTION_SIZE];

    public double num_updates() {
	return alpha;  // the alpha field is reused as the number of updates per step
    }

    /**
       The GUI uses qvalue(x,y,a) to display values and arrows.  
     */
    public double qvalue(int xval, int yval, int action) 
    {
	return qvalues[state(xval,yval,environment.prize,environment.damaged)][action];
    }

    /**
       The GUI uses getCounts(x,y,a) to display counts.  
     */
    public int getCounts(int xval, int yval, int action) 
    {
	return count_actions_in_state[state(xval,yval,environment.prize,environment.damaged)][action];
    }

    /**
     * computes the state index from the x position, y position, prize value, and damage flag
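     *
     * This is a mixed-radix encoding; for example, with X_SIZE=Y_SIZE=PRIZE_SIZE=5
     * and DAMAGE_SIZE=2, state(2,3,1,false) = ((2*5+3)*5+1)*2+0 = 132.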
     */
    public int state(int xval, int yval, int prize, boolean damage)
    {
	return ((xval*Y_SIZE+yval)*PRIZE_SIZE+prize)*DAMAGE_SIZE+(damage?1:0);
    }

    /**
     * resets the Q-values, the reward estimates, and all of the transition and action counts.
     *
     * @param initVal   the initial value given by a box in the GUI
     */
    public void doreset(double initVal)
    {     
	for (int st=0; st<STATE_SIZE; st++)
	    for (int a=0; a<ACTION_SIZE; a++)
		{
		    qvalues[st][a]=initVal;
		    count_actions_in_state[st][a]=0;
		    rewards[st][a]=0.0;
		    for (int rst=0; rst<STATE_SIZE; rst++)
			transitions[st][a][rst]=0;
		}
    }


    /**
     * does one step.
     *
     * carries out the action in the environment, records the resulting
     * transition and reward in the learned model, and then performs
     * num_updates() value-iteration backups on the Q-values.
     *
     * @param action  the action that the agent does
     */
    public void dostep(int action)  { 
	int oldState = state(environment.currX,environment.currY,environment.prize,environment.damaged);

	double reward = environment.dostep(action);

	int newState = state(environment.currX,environment.currY,environment.prize,environment.damaged);

	count_actions_in_state[oldState][action]++;
	transitions[oldState][action][newState]++;
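	// incremental update of the running average reward:
	//   new average = old average + (reward - old average) / n
	// where n is the updated count for (oldState, action)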
	rewards[oldState][action] += (reward-rewards[oldState][action])/count_actions_in_state[oldState][action];

	int s1=oldState;     // the first Q-update is for the state-action pair just carried out
	int a1=action; 
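	// Each update is a value-iteration backup using the learned model:
	//   Q[s1][a1] = R(s1,a1) + discount * sum_{s2} P(s2|s1,a1) * V(s2)
	// where P(s2|s1,a1) is estimated as
	//   transitions[s1][a1][s2] / count_actions_in_state[s1][a1]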
	for (int i=0; i< num_updates(); i++) {
	    if (count_actions_in_state[s1][a1]>0) {
		double qs1a1=0;
		for (int s2=0; s2<STATE_SIZE;s2++)
		    qs1a1+= transitions[s1][a1][s2]*(rewards[s1][a1]+discount*value(s2))/((double)count_actions_in_state[s1][a1]);
		qvalues[s1][a1]=qs1a1;
	    }
	    s1 = ((int) (Math.random() * STATE_SIZE));     // future Q-updates are for random states and actions
	    a1 = ((int) (Math.random() * ACTION_SIZE));
	}
    }

    /**
     * determines the value of a state
     *
     * the value is the maximum, over all actions, of the Q-value
     *
     * @param state  the state index
     * @return the value of the state
     */
    public double value(int state) {
	double val=qvalues[state][0];
	for (int a=1; a<ACTION_SIZE; a++) {
	    if(qvalues[state][a]>val) {
		val=qvalues[state][a];
	    }
	}
	return val;
    }


    /**
     * does numsteps steps
     *
     * This is where you would put your controller
     * @param numsteps  the number of steps to do
     * @param greedyProb  the probability that a step is chosen greedily
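     *
     * For example, doSteps(100, 0.9) does 100 steps, choosing the greedy
     * action on roughly 90% of them and a random action on the rest.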
     */
    public void doSteps(int numsteps, double greedyProb){
	for(int i=0; i<numsteps; i++)
	    {
		double rand = Math.random();
		if (rand<greedyProb)
		    {// act greedily
			int currState = state(environment.currX,environment.currY,environment.prize,environment.damaged);
			// To avoid always choosing the same action when multiple actions
			// have the same Q-value, start at a random direction and return
			// the first action found with the maximum Q-value.
			int startDir = (int) (Math.random() * ACTION_SIZE);
			double bestVal = qvalues[currState][startDir];
			int bestDir = startDir;
			for (int dir=1; dir<ACTION_SIZE; dir++)
			    {
				startDir = (startDir+1) % ACTION_SIZE;
				if (qvalues[currState][startDir] > bestVal)
				    {
					bestVal = qvalues[currState][startDir];
					bestDir = startDir;
				    }
			    }
			dostep(bestDir);
		    }
		else
		    { // act randomly
			dostep((int) (Math.random() * ACTION_SIZE));
		    }
	    }
    }


}
