Add standardization of the returns to obtain more stable training of Reinforce

2026-06-14 22:17:15 +08:00 · 2022-10-17 04:08:13 +02:00
parent c9b9e4810a
commit 7c06423ca4
1 changed files with 13 additions and 2 deletions
--- a/unit5/unit5.ipynb
+++ b/unit5/unit5.ipynb
@@ -673,8 +673,13 @@
    "        for t in range(n_steps)[::-1]:\n",
    "            disc_return_t = (returns[0] if len(returns)>0 else 0)\n",
    "            returns.appendleft(    ) # complete here            \n",
-    "        \n",
-    "\n",
+    "       \n",
+    "        ## standardization of the returns is employed to make training more stable\n",
+    "        eps = np.finfo(np.float32).eps.item()\n",
+    "        ## eps is the float32 machine epsilon (the gap between 1.0 and the next representable float);\n",
+    "        # it is added to the standard deviation of the returns to avoid a division by zero\n",
+    "        returns = torch.tensor(returns)\n",
+    "        returns = (returns - returns.mean()) / (returns.std() + eps)\n",
    "        # Line 7:\n",
    "        policy_loss = []\n",
    "        for log_prob, disc_return in zip(saved_log_probs, returns):\n",
@@ -766,6 +771,12 @@
    "            disc_return_t = (returns[0] if len(returns)>0 else 0)\n",
    "            returns.appendleft( gamma*disc_return_t + rewards[t]   )    \n",
    "            \n",
+    "        ## standardization of the returns is employed to make training more stable\n",
+    "        eps = np.finfo(np.float32).eps.item()\n",
+    "        ## eps is the float32 machine epsilon (the gap between 1.0 and the next representable float);\n",
+    "        # it is added to the standard deviation of the returns to avoid a division by zero\n",
+    "        returns = torch.tensor(returns)\n",
+    "        returns = (returns - returns.mean()) / (returns.std() + eps)\n",
    "        # Line 7:\n",
    "        policy_loss = []\n",
    "        for log_prob, disc_return in zip(saved_log_probs, returns):\n",