"""
Q-Learning Robot Navigation with Dyna-Q Extension
This module implements a Q-Learning algorithm with Dyna-Q model-based planning
for robot navigation tasks. The agent learns to navigate through a grid world
environment to reach a goal while avoiding obstacles.
Author: Timothy Bradford
License: MIT License
Key Features:
- Standard Q-Learning with epsilon-greedy exploration
- Dyna-Q model-based planning for faster learning
- Configurable learning parameters
- Support for grid-world navigation tasks
"""
import random as rand
import numpy as np


class QLearner:
    """
    Q-Learning agent with optional Dyna-Q model-based planning.

    The agent learns optimal navigation policies through trial and error,
    building a Q-table that maps state-action pairs to expected rewards.

    State representation: row_location * 10 + column_location
    Actions: 0=North, 1=East, 2=South, 3=West
    """

    def __init__(
        self,
        num_states=100,
        num_actions=4,
        alpha=0.2,
        gamma=0.9,
        rar=0.5,
        radr=0.99,
        dyna=0,
        verbose=False,
    ):
        """
        Initialize the Q-Learning agent.

        Args:
            num_states: Number of possible states in the environment
            num_actions: Number of available actions (typically 4 for grid navigation)
            alpha: Learning rate (0.0-1.0), controls how quickly Q-values are updated
            gamma: Discount factor (0.0-1.0), weights importance of future rewards
            rar: Random action rate, probability of exploration vs exploitation
            radr: Random action decay rate, reduces exploration over time
            dyna: Number of planning steps per real experience (0 = disabled)
            verbose: Enable debug output
        """
        # Core parameters
        self.alpha = alpha
        self.gamma = gamma
        self.rar = rar
        self.radr = radr
        self.dyna = dyna
        self.verbose = verbose
        self.num_actions = num_actions

        # Q-table: stores learned action values for each state
        self.Q = np.zeros((num_states, num_actions))

        # Current state and action
        self.s = 0
        self.a = 0

        # Dyna-Q model components
        if dyna > 0:
            # Transition model: T[s,a] = s' (deterministic)
            self.T = np.full((num_states, num_actions), -1, dtype=np.int32)
            # Reward model: R[s,a] = expected reward
            self.R = np.zeros((num_states, num_actions))
            # Track which state-action pairs have been visited
            self.visited_states = set()
            self.visited_state_actions = set()

    def querysetstate(self, s):
        """
        Set the agent's state and return an initial action without learning.

        Used to initialize the agent at the start of an episode.

        Args:
            s: The starting state

        Returns:
            The selected action for the given state
        """
        self.s = s
        action = self._select_action(s)
        self.a = action

        if self.verbose:
            print(f"Initial state: s={s}, action={action}")

        return action

    def query(self, s_prime, r):
        """
        Core learning method: update the Q-table and return the next action.

        Updates the Q-table based on the experience tuple (s, a, s', r),
        optionally performs Dyna planning, and selects the next action.

        Args:
            s_prime: The new state after taking action a from state s
            r: The reward received for the transition

        Returns:
            The next action to take from state s_prime
        """
        # Update Q-table with real experience
        self._update_q_table(self.s, self.a, s_prime, r)

        # Perform Dyna-Q planning if enabled
        if self.dyna > 0:
            self._dyna_planning(self.s, self.a, s_prime, r)

        # Select next action
        a_prime = self._select_action(s_prime)

        # Decay exploration rate
        self.rar *= self.radr

        if self.verbose:
            print(f"Experience: s={self.s}, a={self.a}, s'={s_prime}, r={r:.2f}, next_a={a_prime}")

        # Update current state and action
        self.s = s_prime
        self.a = a_prime

        return a_prime

    def _select_action(self, state):
        """
        Select an action using an epsilon-greedy strategy.

        Args:
            state: Current state

        Returns:
            Selected action
        """
        if rand.uniform(0, 1) < self.rar:
            # Exploration: random action
            return rand.randint(0, self.num_actions - 1)
        else:
            # Exploitation: best known action
            return int(np.argmax(self.Q[state]))

    def _update_q_table(self, s, a, s_prime, r):
        """
        Update a Q-value using the Q-learning update rule.

        Q(s,a) ← Q(s,a) + α[r + γ·max_a' Q(s',a') - Q(s,a)]

        Args:
            s: Previous state
            a: Action taken
            s_prime: Resulting state
            r: Reward received
        """
        current_q = self.Q[s, a]
        max_future_q = np.max(self.Q[s_prime])

        # TD target
        target = r + self.gamma * max_future_q

        # TD error
        td_error = target - current_q

        # Update Q-value
        self.Q[s, a] = current_q + self.alpha * td_error
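
        # Illustrative worked example (not from the original source): with
        # alpha=0.2 and gamma=0.9, if Q(s,a)=0, max_a' Q(s',a')=10 and r=-1,
        # then target = -1 + 0.9*10 = 8, td_error = 8 - 0 = 8, and the
        # updated Q(s,a) = 0 + 0.2*8 = 1.6.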

    def _dyna_planning(self, s, a, s_prime, r):
        """
        Perform Dyna-Q model-based planning.

        Updates the internal model of the environment and uses it
        to generate simulated experiences for additional learning.

        Args:
            s: State
            a: Action
            s_prime: Next state
            r: Reward
        """
        # Update the model with real experience
        self._update_model(s, a, s_prime, r)

        # Perform planning steps
        for _ in range(self.dyna):
            # Sample from previously visited state-action pairs
            sim_s, sim_a = self._sample_state_action()
            if sim_s is not None:
                # Use the model to predict the next state and reward
                sim_s_prime, sim_r = self._model_predict(sim_s, sim_a)
                if sim_s_prime is not None:
                    # Update Q-table with simulated experience
                    self._update_q_table(sim_s, sim_a, sim_s_prime, sim_r)

    def _update_model(self, s, a, s_prime, r):
        """
        Update the internal model of the environment.

        Args:
            s: State
            a: Action
            s_prime: Next state
            r: Reward
        """
        # Deterministic transition model
        self.T[s, a] = s_prime

        # Update reward model (a running average could be used for stochastic rewards)
        self.R[s, a] = r

        # Track visited states and state-action pairs
        self.visited_states.add(s)
        self.visited_state_actions.add((s, a))

    def _sample_state_action(self):
        """
        Sample a random state-action pair from previously visited ones.

        Returns:
            Tuple of (state, action), or (None, None) if no history exists
        """
        if not self.visited_state_actions:
            return None, None

        # Random sampling from visited state-action pairs
        s, a = rand.choice(list(self.visited_state_actions))
        return s, a

    def _model_predict(self, s, a):
        """
        Use the model to predict the next state and reward.

        Args:
            s: State
            a: Action

        Returns:
            Tuple of (next_state, reward), or (None, 0) if not in the model
        """
        s_prime = self.T[s, a]

        # Check whether this state-action pair has been visited
        if s_prime == -1:
            return None, 0

        r = self.R[s, a]
        return s_prime, r

    def get_q_table(self):
        """
        Get a copy of the Q-table for analysis or visualization.

        Returns:
            Copy of the Q-table
        """
        return self.Q.copy()

    def get_policy(self):
        """
        Extract the learned policy from the Q-table.

        Returns:
            Array where each element is the best action for that state
        """
        return np.argmax(self.Q, axis=1)

    def get_value_function(self):
        """
        Get the state value function V(s) = max_a Q(s,a).

        Returns:
            Array of state values
        """
        return np.max(self.Q, axis=1)

    def reset_exploration(self, rar=0.5):
        """
        Reset the exploration rate (useful for new episodes).

        Args:
            rar: New random action rate
        """
        self.rar = rar


if __name__ == "__main__":
    # Example usage
    print("Q-Learning Agent with Dyna-Q Planning")
    print("-" * 40)

    # Create a simple Q-learner
    learner = QLearner(
        num_states=100,
        num_actions=4,
        alpha=0.2,
        gamma=0.9,
        rar=0.5,
        radr=0.99,
        dyna=200,  # Enable Dyna-Q with 200 planning steps
        verbose=False,
    )

    print("Agent initialized with:")
    print("  States: 100")
    print("  Actions: 4 (N, E, S, W)")
    print("  Learning rate: 0.2")
    print("  Discount factor: 0.9")
    print("  Dyna planning steps: 200")