update the frozenlake environment
Browse files
- .vscode/launch.json +1 -1
- code/Lake application/envs/frozen_lake.py +37 -101
.vscode/launch.json
CHANGED
|
@@ -11,7 +11,7 @@
|
|
| 11 |
"request": "launch",
|
| 12 |
"program": "${file}",
|
| 13 |
"console": "integratedTerminal",
|
| 14 |
-
"cwd": "C:\\Users\\leona\\Meu Drive\\USP\\Doutorado\\PoliTO\\pdppo\\code\\
|
| 15 |
|
| 16 |
}
|
| 17 |
]
|
|
|
|
| 11 |
"request": "launch",
|
| 12 |
"program": "${file}",
|
| 13 |
"console": "integratedTerminal",
|
| 14 |
+
"cwd": "C:\\Users\\leona\\Meu Drive\\USP\\Doutorado\\PoliTO\\pdppo\\code\\Lot-sizing\\"
|
| 15 |
|
| 16 |
}
|
| 17 |
]
|
code/Lake application/envs/frozen_lake.py
CHANGED
|
@@ -130,6 +130,7 @@ class FrozenLakeEnv(discrete.DiscreteEnv):
|
|
| 130 |
break
|
| 131 |
if goal_position:
|
| 132 |
break
|
|
|
|
| 133 |
|
| 134 |
def proximity_reward(current_row, current_col):
|
| 135 |
goal_row, goal_col = goal_position
|
|
@@ -140,92 +141,19 @@ class FrozenLakeEnv(discrete.DiscreteEnv):
|
|
| 140 |
newrow, newcol = inc(row, col, a)
|
| 141 |
newstate = to_s(newrow, newcol)
|
| 142 |
newletter = desc[newrow, newcol]
|
| 143 |
-
terminated = bytes(newletter) in b"GH"
|
| 144 |
reward = float(newletter == b"G")
|
| 145 |
if not terminated:
|
| 146 |
-
reward = proximity_reward(newrow, newcol)
|
| 147 |
return newstate, reward, terminated
|
| 148 |
|
| 149 |
-
# def update_probability_matrix(row, col, action):
|
| 150 |
-
# newrow, newcol = inc(row, col, action)
|
| 151 |
-
# newstate = to_s(newrow, newcol)
|
| 152 |
-
# newletter = desc[newrow, newcol]
|
| 153 |
-
# done = bytes(newletter) in b"GH"
|
| 154 |
-
# reward = float(newletter == b"G")
|
| 155 |
-
# return newstate, reward, done
|
| 156 |
-
|
| 157 |
-
# for row in range(nrow):
|
| 158 |
-
# for col in range(ncol):
|
| 159 |
-
# s = to_s(row, col)
|
| 160 |
-
# for a in range(4):
|
| 161 |
-
# li = P[s][a]
|
| 162 |
-
# letter = desc[row, col]
|
| 163 |
-
# if letter in b"GH":
|
| 164 |
-
# li.append((1.0, s, 0, True))
|
| 165 |
-
# else:
|
| 166 |
-
# if is_slippery:
|
| 167 |
-
# for b in [(a - 1) % 4, a, (a + 1) % 4]:
|
| 168 |
-
# li.append(
|
| 169 |
-
# (1.0 / 3.0, *update_probability_matrix(row, col, b))
|
| 170 |
-
# )
|
| 171 |
-
# else:
|
| 172 |
-
# li.append((1.0, *update_probability_matrix(row, col, a)))
|
| 173 |
-
|
| 174 |
np.random.seed(42) # Set a seed for reproducibility
|
| 175 |
tile_probabilities = np.random.dirichlet(np.ones(4), size=(nrow, ncol))
|
| 176 |
|
| 177 |
def to_row_col(s):
|
| 178 |
return divmod(s, ncol)
|
| 179 |
|
| 180 |
-
|
| 181 |
-
# for col in range(ncol):
|
| 182 |
-
# s = to_s(row, col)
|
| 183 |
-
# for a in range(4):
|
| 184 |
-
# li = P[s][a]
|
| 185 |
-
# letter = desc[row, col]
|
| 186 |
-
# if letter in b"GH":
|
| 187 |
-
# li.append((1.0, s, 0, True))
|
| 188 |
-
# else:
|
| 189 |
-
# if is_slippery:
|
| 190 |
-
# # First, the agent moves in the desired direction
|
| 191 |
-
# newstate, reward, terminated = update_probability_matrix(row, col, a)
|
| 192 |
-
# if terminated:
|
| 193 |
-
# li.append((1.0, newstate, reward, terminated))
|
| 194 |
-
# else:
|
| 195 |
-
# # After the first move, slippery condition causes a random additional movement
|
| 196 |
-
# row2, col2 = to_row_col(newstate)
|
| 197 |
-
# for b in range(4):
|
| 198 |
-
# li.append(
|
| 199 |
-
# (1.0 / 4.0, *update_probability_matrix(row2, col2, b))
|
| 200 |
-
# )
|
| 201 |
-
# else:
|
| 202 |
-
# li.append((1.0, *update_probability_matrix(row, col, a)))
|
| 203 |
-
|
| 204 |
-
# for row in range(nrow):
|
| 205 |
-
# for col in range(ncol):
|
| 206 |
-
# s = to_s(row, col)
|
| 207 |
-
# for a in range(4):
|
| 208 |
-
# li = P[s][a]
|
| 209 |
-
# letter = desc[row, col]
|
| 210 |
-
# if letter in b"GH":
|
| 211 |
-
# li.append((1.0, s, 0, True))
|
| 212 |
-
# else:
|
| 213 |
-
# if is_slippery:
|
| 214 |
-
# # First, the agent moves in the desired direction
|
| 215 |
-
# newstate, reward, terminated = update_probability_matrix(row, col, a)
|
| 216 |
-
# if terminated:
|
| 217 |
-
# li.append((1.0, newstate, reward, terminated))
|
| 218 |
-
# else:
|
| 219 |
-
# # After the first move, slippery condition causes an additional movement
|
| 220 |
-
# row2, col2 = to_row_col(newstate)
|
| 221 |
-
# for b, prob in enumerate(tile_probabilities[row2, col2]):
|
| 222 |
-
# li.append(
|
| 223 |
-
# (prob, *update_probability_matrix(row2, col2, b))
|
| 224 |
-
# )
|
| 225 |
-
# else:
|
| 226 |
-
# li.append((1.0, *update_probability_matrix(row, col, a)))
|
| 227 |
-
|
| 228 |
-
base_slip_prob=0.3
|
| 229 |
|
| 230 |
for row in range(nrow):
|
| 231 |
for col in range(ncol):
|
|
@@ -245,40 +173,48 @@ class FrozenLakeEnv(discrete.DiscreteEnv):
|
|
| 245 |
# After the first move, slippery condition causes an additional movement
|
| 246 |
row2, col2 = to_row_col(newstate)
|
| 247 |
for b, prob in enumerate(tile_probabilities[row2, col2]):
|
|
|
|
| 248 |
li.append(
|
| 249 |
-
(base_slip_prob * prob, *update_probability_matrix(row2, col2, b))
|
| 250 |
)
|
| 251 |
# Add the remaining probability for staying at the newstate
|
| 252 |
li.append((1.0 - base_slip_prob, newstate, reward, False))
|
| 253 |
else:
|
| 254 |
li.append((1.0, *update_probability_matrix(row, col, a)))
|
|
|
|
|
|
|
| 255 |
super(FrozenLakeEnv, self).__init__(nS, nA, P, isd)
|
| 256 |
|
| 257 |
def get_post_decision_state(self, s, a):
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 282 |
|
| 283 |
def render(self, mode="human"):
|
| 284 |
outfile = StringIO() if mode == "ansi" else sys.stdout
|
|
|
|
| 130 |
break
|
| 131 |
if goal_position:
|
| 132 |
break
|
| 133 |
+
self.goal_position = goal_position
|
| 134 |
|
| 135 |
def proximity_reward(current_row, current_col):
|
| 136 |
goal_row, goal_col = goal_position
|
|
|
|
| 141 |
newrow, newcol = inc(row, col, a)
|
| 142 |
newstate = to_s(newrow, newcol)
|
| 143 |
newletter = desc[newrow, newcol]
|
| 144 |
+
terminated = bytes(newletter) in b"G"
|
| 145 |
reward = float(newletter == b"G")
|
| 146 |
if not terminated:
|
| 147 |
+
reward = proximity_reward(newrow, newcol) + float(newletter == b"H")* -(1/(nrow+ncol))
|
| 148 |
return newstate, reward, terminated
|
| 149 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 150 |
np.random.seed(42) # Set a seed for reproducibility
|
| 151 |
tile_probabilities = np.random.dirichlet(np.ones(4), size=(nrow, ncol))
|
| 152 |
|
| 153 |
def to_row_col(s):
|
| 154 |
return divmod(s, ncol)
|
| 155 |
|
| 156 |
+
base_slip_prob= 0.50
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
|
| 158 |
for row in range(nrow):
|
| 159 |
for col in range(ncol):
|
|
|
|
| 173 |
# After the first move, slippery condition causes an additional movement
|
| 174 |
row2, col2 = to_row_col(newstate)
|
| 175 |
for b, prob in enumerate(tile_probabilities[row2, col2]):
|
| 176 |
+
newstate_post, reward_pos, terminated_post = update_probability_matrix(row2, col2, b)
|
| 177 |
li.append(
|
| 178 |
+
(base_slip_prob * prob, newstate_post, reward_pos + reward, terminated_post)
|
| 179 |
)
|
| 180 |
# Add the remaining probability for staying at the newstate
|
| 181 |
li.append((1.0 - base_slip_prob, newstate, reward, False))
|
| 182 |
else:
|
| 183 |
li.append((1.0, *update_probability_matrix(row, col, a)))
|
| 184 |
+
|
| 185 |
+
self.P = P
|
| 186 |
super(FrozenLakeEnv, self).__init__(nS, nA, P, isd)
|
| 187 |
|
| 188 |
def get_post_decision_state(self, s, a):
|
| 189 |
+
def proximity_reward(current_row, current_col):
|
| 190 |
+
goal_row, goal_col = self.goal_position
|
| 191 |
+
distance = abs(goal_row - current_row) + abs(goal_col - current_col)
|
| 192 |
+
return 1.0 / (1.0 + distance)
|
| 193 |
+
|
| 194 |
+
def inc(row, col, a):
|
| 195 |
+
if a == LEFT:
|
| 196 |
+
col = max(col - 1, 0)
|
| 197 |
+
elif a == DOWN:
|
| 198 |
+
row = min(row + 1, self.nrow - 1)
|
| 199 |
+
elif a == RIGHT:
|
| 200 |
+
col = min(col + 1, self.ncol - 1)
|
| 201 |
+
elif a == UP:
|
| 202 |
+
row = max(row - 1, 0)
|
| 203 |
+
return (row, col)
|
| 204 |
+
|
| 205 |
+
def to_s(row, col):
|
| 206 |
+
return row * self.ncol + col
|
| 207 |
+
|
| 208 |
+
def to_row_col(s):
|
| 209 |
+
row = s // self.ncol
|
| 210 |
+
col = s % self.ncol
|
| 211 |
+
return row, col
|
| 212 |
+
|
| 213 |
+
row, col = to_row_col(s)
|
| 214 |
+
next_row, next_col = inc(row, col, a)
|
| 215 |
+
next_s = to_s(next_row, next_col)
|
| 216 |
+
next_r = proximity_reward(next_row, next_col)
|
| 217 |
+
return next_s, next_r
|
| 218 |
|
| 219 |
def render(self, mode="human"):
|
| 220 |
outfile = StringIO() if mode == "ansi" else sys.stdout
|