// Q_learning.cpp

#include <stdio.h>
#include <iostream>
#include <iomanip>
#include <ctime>
#include <cstdlib>

using namespace std;


// Dimensions of the grid world; both the R and Q tables are row x col.
const int row = 13;
const int col = 13;

// Weight given to the next cell's Q-value when an update sample is formed.
// NOTE(review): some <math.h> implementations declare a legacy ::gamma
// function; if a toolchain reports a redeclaration here, this variable needs
// renaming together with its uses.
double gamma = 0.8;
// Blend factor between the stored Q-value and a freshly computed sample.
double alpha = 0.1;
// Number of steps within which reaching the goal still earns the goal reward.
double R_step = 200;

// Per-cell reward table and per-cell learned value table, all zeros at start.
double R[row][col] = {};
double Q[row][col] = {};

// Not read or written by any active code in the visible part of the file.
bool Goal = false;

// Episode counter (starts at 1) and the episode limit that main reads from stdin.
int iterations = 1;
int it_ = 0;
// Most recent action code: 1 = North, 2 = East, 3 = South, 4 = West.
// It starts at 0, which main treats as "no action yet".
int user_action = 0;
// Not referenced by the visible code.
int update_final_state = 0;
// Q-value of the cell being moved into (0 for terminal updates).
double Q_next_state = 0.0;

// Current agent position (row index i, column index j).
int i = 0;
int j = 0;
// Q-value reported for the agent's current cell.
double Q_curr_state = Q[i][j];

// Reward term used by the most recent update.
double reward = 0.0;

// Cell holding the positive reward: top row, middle column.
int R_indx_i = 0;
int R_indx_j = col / 2;

// Cell holding the punishment: second-to-last row, last column.
int P_indx_i = row - 2;
int P_indx_j = col - 1;

// Step counter for the running episode; main negates it to form Time_Reward.
int counter = 1;
int Time_Reward = 0;
// Most recent update target: reward + gamma * Q_next_state.
double sample = 0.0;

// Helpers defined after main.
void print_R();
void print_Q();
void generate_rand();

int main()
{
        R[R_indx_i][R_indx_j] = 50; // reward
        R[P_indx_i][P_indx_j] = -100; // punishment

        print_R();
        
        cout << "\n iterations ? \n" ;
        cin >> it_;

        /* initialize random seed: */
  	srand (time(NULL));

        while ( iterations <= it_ ) 
        {       
                if (user_action == 1 && i != 0) // North
                {
                        reward = R[i][j];
                        Q_next_state = Q[i - 1][j];
                        
                        sample = reward + gamma * Q_next_state;
                        Q[i][j] = ((1 - alpha) * Q[i][j]) + (alpha * sample);
                        Q_curr_state = Q_next_state;
                        //printf(" Q_current_state = %f \n",Q_curr_state);
                        i--;
                        counter++;
                } else if (user_action == 1 && i == 0) // North
                {
                        cout << "You can't go further up!\n";
                } else if (user_action == 3 && i < (row - 1)) // South, i < row
                {
                        reward = R[i][j];
                        Q_next_state = Q[i + 1][j];
                        
                        sample = reward + gamma * Q_next_state;
                        Q[i][j] = ((1 - alpha) * Q[i][j]) + (alpha * sample);
                        Q_curr_state = Q_next_state;
                        //printf(" Q_current_state = %f \n",Q_curr_state);
                        i++;
                        counter++;
                } else if (user_action == 3 && i >= (row - 1)) // South
                {
                        cout << "You can't go further down!\n";
                } else if (user_action == 2 && j < (col - 1)) // East
                {
                        reward = R[i][j];
                        Q_next_state = Q[i][j + 1];
                        
                        sample = reward + gamma * Q_next_state;
                        Q[i][j] = ((1 - alpha) * Q[i][j]) + (alpha * sample);
                        Q_curr_state = Q_next_state;
                        //printf(" Q_current_state = %f \n",Q_curr_state);
                        j++;
                        counter++;
                } else if (user_action == 2 && j >= (col - 1)) // East, j > col
                {
                        cout << "You can't go further right!\n";
                } else if (user_action == 4 && j != 0 ) // West
                {
                        reward = R[i][j];
                        Q_next_state = Q[i][j - 1];
                        
                        sample = reward + gamma * Q_next_state;
                        Q[i][j] = ((1 - alpha) * Q[i][j]) + (alpha * sample);
                        Q_curr_state = Q_next_state;
                        //printf(" Q_current_state = %f \n",Q_curr_state);
                        j--;
                        counter++;
                } else if (user_action == 4 && j == 0) // West, j = 1
                {
                        cout << "You can't go further left!\n";
                } else
                {
                        cout << "\nGenerating random pose in grid for 1st. time!\n";
                        generate_rand();
                }
                
                // + Reward
                if (i == R_indx_i && j == R_indx_j)
                {
                        Time_Reward = -counter;
                        cout << " Time Reward = "<< Time_Reward << "\n";
                        
                        if(abs(Time_Reward) <= R_step)
                        {
                                
                                cout << "\n Goal is achieved <= " << R_step << " time steps\n";
                                reward = R[i][j];
                                Q_next_state = 0;
                        
                                sample = reward + gamma * Q_next_state;
                                Q[i][j] = ((1 - alpha) * Q[i][j]) + (alpha * sample);
                        } else
                        {
                                cout << "\n Goal is achieved > " << R_step << " time steps => time_punishment\n";
                                reward = -1; // ???
                                Q_next_state = 0;
                        
                                sample = reward + gamma * Q_next_state;
                                Q[i][j] = ((1 - alpha) * Q[i][j]) + (alpha * sample);
                        }
                        
                        counter = 1;
                        print_Q();
                        generate_rand();
                        iterations++;
                                
                                //Goal = true;     
                } else if (i == P_indx_i && j == P_indx_j) // - Reward => Punishment
                {
                           cout << "\n Failed to achieve a goal! \n";
                                
                                reward = R[i][j];
                                Q_next_state = 0;
                        
                                sample = reward + gamma * Q_next_state;
                                Q[i][j] = ((1 - alpha) * Q[i][j]) + (alpha * sample);
                                
                                print_Q();
                                generate_rand();
                                iterations++;
                                //Goal = true;     
                }
                
                cout << "\n Q_value = " << Q_curr_state << " , actions N(1), E(2), S(3), W(4) : ";
                user_action = ((double) rand() / (RAND_MAX)) * (5 - 1) + 1;
                printf(" ramdom user action = %i \n",user_action);
                //cin >> user_action;
                
          }
return 0;
}

/*
 * Writes the reward table R to stdout: a " R = " header, then one line per
 * grid row with the cells right-aligned and separated by " , ", followed by
 * a blank line.
 */
void print_R()
{
        // The field width is tied to the column count, as in the original layout.
        const int field_width = col - 1;

        cout << " R = \n";
        for (int r = 0; r < row; ++r)
        {
                for (int c = 0; c < col; ++c)
                {
                        // Separator goes between cells, never after the last one.
                        if (c > 0)
                        {
                                cout << " , ";
                        }
                        cout << setw(field_width) << R[r][c];
                }
                cout << "\n";
        }
        cout << "\n";
}
/*
 * Writes the learned value table Q to stdout: a " Q = " header, then one line
 * per grid row with the cells right-aligned and separated by " , ", followed
 * by a blank line.
 */
void print_Q()
{
        // The field width is tied to the column count, as in the original layout.
        const int field_width = col - 1;

        cout << " Q = \n";
        for (int r = 0; r < row; ++r)
        {
                for (int c = 0; c < col; ++c)
                {
                        // Separator goes between cells, never after the last one.
                        if (c > 0)
                        {
                                cout << " , ";
                        }
                        cout << setw(field_width) << Q[r][c];
                }
                cout << "\n";
        }
        cout << "\n";
}

/*
 * Moves the agent to a pseudo-randomly chosen cell of the grid, refreshes
 * Q_curr_state from that cell and prints the new position and its Q-value.
 *
 * Draws two values from rand(): the first picks the row index i, the second
 * the column index j.
 */
void generate_rand()
{
        // rand() may return RAND_MAX itself. Dividing by RAND_MAX + 1.0 keeps
        // the ratio strictly below 1, so the scaled index stays in 0..row-1 and
        // 0..col-1. Dividing by RAND_MAX could yield row or col, one past the
        // end of Q.
        i = static_cast<int>((static_cast<double>(rand()) / (RAND_MAX + 1.0)) * row);
        j = static_cast<int>((static_cast<double>(rand()) / (RAND_MAX + 1.0)) * col);

        Q_curr_state = Q[i][j];

        cout << "\n i = " << i << " , j = " << j << " => Q-value[i][j] = " << Q_curr_state << " \n";

}