update

spatialthoughts · spatialthoughts · commit b76b5cc92844 · 2026-03-28T13:00:34.000+05:30
diff --git a/notebooks/geopandas_flood_frequency.ipynb b/notebooks/geopandas_flood_frequency.ipynb
@@ -44,7 +44,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 1,
+      "execution_count": 31,
       "id": "cell-3",
       "metadata": {
         "id": "cell-3"
@@ -58,7 +58,10 @@
         "from shapely import STRtree\n",
         "import matplotlib.pyplot as plt\n",
         "import matplotlib.colors as mcolors\n",
-        "import numpy as np"
+        "import numpy as np\n",
+        "from scipy.sparse import coo_matrix\n",
+        "from scipy.sparse.csgraph import connected_components\n",
+        "from shapely import STRtree"
       ]
     },
     {
@@ -702,297 +705,60 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 16,
+      "execution_count": 27,
       "id": "e872uw2n33q",
       "metadata": {
-        "id": "e872uw2n33q"
+        "id": "e872uw2n33q",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "outputId": "424cf011-6373-4785-fda6-a8a8d9a138ea"
       },
-      "outputs": [],
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Filtered to 659,052, which is significantly more memory efficient.\n"
+          ]
+        }
+      ],
       "source": [
-        "# This step is computationally expensive and may take a few minutes\n",
+        "# This step is computationally intensive and takes a few minutes.\n",
         "\n",
-        "# Work on a clean positional index so iloc lookups are safe\n",
+        "# Work on a clean positional index\n",
         "gdf_flood = gdf_filtered.reset_index(drop=True)\n",
-        "dates = pd.to_datetime(gdf_flood['start_date']).values  # numpy datetime64 array\n",
+        "dates = pd.to_datetime(gdf_flood['start_date']).values\n",
+        "ndays = 7\n",
         "\n",
-        "# Bulk spatial query: returns two arrays (query_geom_index, tree_geom_index)\n",
-        "# for every intersecting pair across the entire dataset\n",
+        "# Use spatial index to find candidate pairs\n",
         "tree = STRtree(gdf_flood.geometry.values)\n",
         "left_idx, right_idx = tree.query(gdf_flood.geometry.values, predicate='intersects')\n",
         "\n",
-        "# Keep only unique pairs (left < right removes self-matches and duplicates)\n",
+        "# Drop self-matches (a,a) and mirrored duplicates (keep a,b; discard b,a)\n",
         "mask = left_idx < right_idx\n",
         "left_idx = left_idx[mask]\n",
         "right_idx = right_idx[mask]\n",
         "\n",
-        "# Compute date difference in days for every spatial pair\n",
+        "# Compute date differences using NumPy to avoid creating a large DataFrame\n",
+        "# If this step runs out of memory, process left_idx/right_idx in chunks instead\n",
         "date_diff_days = np.abs(\n",
         "    (dates[left_idx] - dates[right_idx]).astype('timedelta64[D]').astype(int)\n",
         ")\n",
         "\n",
-        "# Assemble a dataframe of all spatially intersecting pairs\n",
-        "pairs = pd.DataFrame({\n",
-        "    'idx_a':          left_idx,\n",
-        "    'idx_b':          right_idx,\n",
-        "    'uuid_a':         gdf_flood['uuid'].iloc[left_idx].values,\n",
-        "    'uuid_b':         gdf_flood['uuid'].iloc[right_idx].values,\n",
-        "    'date_a':         gdf_flood['start_date'].iloc[left_idx].values,\n",
-        "    'date_b':         gdf_flood['start_date'].iloc[right_idx].values,\n",
-        "    'date_diff_days': date_diff_days,\n",
-        "})\n",
-        "\n",
-        "ndays = 7 # This will be our threshold for \"nearby\" dates in the next step\n",
+        "# Filter indices based on the date threshold\n",
+        "time_mask = date_diff_days <= ndays\n",
+        "nearby_left = left_idx[time_mask]\n",
+        "nearby_right = right_idx[time_mask]\n",
         "\n",
-        "# Filter to pairs whose dates are within ndays of each other\n",
-        "nearby = pairs[pairs['date_diff_days'] <= ndays].reset_index(drop=True)"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "source": [
-        "print(f'Intersecting polygon pairs with dates within ±{ndays} days: {len(nearby):>10,}')\n",
-        "nearby.loc[:5, ['uuid_a', 'uuid_b', 'date_a', 'date_b']]"
-      ],
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/",
-          "height": 255
-        },
-        "id": "6cjv4TsaBl_K",
-        "outputId": "955e2f6b-4c89-4ac2-b463-859191844dc0"
-      },
-      "id": "6cjv4TsaBl_K",
-      "execution_count": 17,
-      "outputs": [
-        {
-          "output_type": "stream",
-          "name": "stdout",
-          "text": [
-            "Intersecting polygon pairs with dates within ±7 days:    659,052\n"
-          ]
-        },
-        {
-          "output_type": "execute_result",
-          "data": {
-            "text/plain": [
-              "                             uuid_a                            uuid_b  \\\n",
-              "0  d274bd96994a45e4b1260c873c37f68a  fbbc8e9a2dfa4051aa34b04c58b86424   \n",
-              "1  d274bd96994a45e4b1260c873c37f68a  956e80a7f5c34de3950724210c39cd31   \n",
-              "2  d274bd96994a45e4b1260c873c37f68a  322a35b0600f45bd9cad635331b479c8   \n",
-              "3  d274bd96994a45e4b1260c873c37f68a  06dddcc22a5d451cb0215f3412f1a6c6   \n",
-              "4  d274bd96994a45e4b1260c873c37f68a  61b73731228441a9af8c11d2ff33f979   \n",
-              "5  d274bd96994a45e4b1260c873c37f68a  b41c89ed5ada41a7949e6b03713e4a67   \n",
-              "\n",
-              "       date_a      date_b  \n",
-              "0  2000-08-23  2000-08-23  \n",
-              "1  2000-08-23  2000-08-23  \n",
-              "2  2000-08-23  2000-08-26  \n",
-              "3  2000-08-23  2000-08-26  \n",
-              "4  2000-08-23  2000-08-23  \n",
-              "5  2000-08-23  2000-08-26  "
-            ],
-            "text/html": [
-              "\n",
-              "  <div id=\"df-74352ace-f3fd-4d39-ab9b-a74ad7735047\" class=\"colab-df-container\">\n",
-              "    <div>\n",
-              "<style scoped>\n",
-              "    .dataframe tbody tr th:only-of-type {\n",
-              "        vertical-align: middle;\n",
-              "    }\n",
-              "\n",
-              "    .dataframe tbody tr th {\n",
-              "        vertical-align: top;\n",
-              "    }\n",
-              "\n",
-              "    .dataframe thead th {\n",
-              "        text-align: right;\n",
-              "    }\n",
-              "</style>\n",
-              "<table border=\"1\" class=\"dataframe\">\n",
-              "  <thead>\n",
-              "    <tr style=\"text-align: right;\">\n",
-              "      <th></th>\n",
-              "      <th>uuid_a</th>\n",
-              "      <th>uuid_b</th>\n",
-              "      <th>date_a</th>\n",
-              "      <th>date_b</th>\n",
-              "    </tr>\n",
-              "  </thead>\n",
-              "  <tbody>\n",
-              "    <tr>\n",
-              "      <th>0</th>\n",
-              "      <td>d274bd96994a45e4b1260c873c37f68a</td>\n",
-              "      <td>fbbc8e9a2dfa4051aa34b04c58b86424</td>\n",
-              "      <td>2000-08-23</td>\n",
-              "      <td>2000-08-23</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>1</th>\n",
-              "      <td>d274bd96994a45e4b1260c873c37f68a</td>\n",
-              "      <td>956e80a7f5c34de3950724210c39cd31</td>\n",
-              "      <td>2000-08-23</td>\n",
-              "      <td>2000-08-23</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>2</th>\n",
-              "      <td>d274bd96994a45e4b1260c873c37f68a</td>\n",
-              "      <td>322a35b0600f45bd9cad635331b479c8</td>\n",
-              "      <td>2000-08-23</td>\n",
-              "      <td>2000-08-26</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>3</th>\n",
-              "      <td>d274bd96994a45e4b1260c873c37f68a</td>\n",
-              "      <td>06dddcc22a5d451cb0215f3412f1a6c6</td>\n",
-              "      <td>2000-08-23</td>\n",
-              "      <td>2000-08-26</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>4</th>\n",
-              "      <td>d274bd96994a45e4b1260c873c37f68a</td>\n",
-              "      <td>61b73731228441a9af8c11d2ff33f979</td>\n",
-              "      <td>2000-08-23</td>\n",
-              "      <td>2000-08-23</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>5</th>\n",
-              "      <td>d274bd96994a45e4b1260c873c37f68a</td>\n",
-              "      <td>b41c89ed5ada41a7949e6b03713e4a67</td>\n",
-              "      <td>2000-08-23</td>\n",
-              "      <td>2000-08-26</td>\n",
-              "    </tr>\n",
-              "  </tbody>\n",
-              "</table>\n",
-              "</div>\n",
-              "    <div class=\"colab-df-buttons\">\n",
-              "\n",
-              "  <div class=\"colab-df-container\">\n",
-              "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-74352ace-f3fd-4d39-ab9b-a74ad7735047')\"\n",
-              "            title=\"Convert this dataframe to an interactive table.\"\n",
-              "            style=\"display:none;\">\n",
-              "\n",
-              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
-              "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
-              "  </svg>\n",
-              "    </button>\n",
-              "\n",
-              "  <style>\n",
-              "    .colab-df-container {\n",
-              "      display:flex;\n",
-              "      gap: 12px;\n",
-              "    }\n",
-              "\n",
-              "    .colab-df-convert {\n",
-              "      background-color: #E8F0FE;\n",
-              "      border: none;\n",
-              "      border-radius: 50%;\n",
-              "      cursor: pointer;\n",
-              "      display: none;\n",
-              "      fill: #1967D2;\n",
-              "      height: 32px;\n",
-              "      padding: 0 0 0 0;\n",
-              "      width: 32px;\n",
-              "    }\n",
-              "\n",
-              "    .colab-df-convert:hover {\n",
-              "      background-color: #E2EBFA;\n",
-              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
-              "      fill: #174EA6;\n",
-              "    }\n",
-              "\n",
-              "    .colab-df-buttons div {\n",
-              "      margin-bottom: 4px;\n",
-              "    }\n",
-              "\n",
-              "    [theme=dark] .colab-df-convert {\n",
-              "      background-color: #3B4455;\n",
-              "      fill: #D2E3FC;\n",
-              "    }\n",
-              "\n",
-              "    [theme=dark] .colab-df-convert:hover {\n",
-              "      background-color: #434B5C;\n",
-              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
-              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
-              "      fill: #FFFFFF;\n",
-              "    }\n",
-              "  </style>\n",
-              "\n",
-              "    <script>\n",
-              "      const buttonEl =\n",
-              "        document.querySelector('#df-74352ace-f3fd-4d39-ab9b-a74ad7735047 button.colab-df-convert');\n",
-              "      buttonEl.style.display =\n",
-              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
-              "\n",
-              "      async function convertToInteractive(key) {\n",
-              "        const element = document.querySelector('#df-74352ace-f3fd-4d39-ab9b-a74ad7735047');\n",
-              "        const dataTable =\n",
-              "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
-              "                                                    [key], {});\n",
-              "        if (!dataTable) return;\n",
-              "\n",
-              "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
-              "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
-              "          + ' to learn more about interactive tables.';\n",
-              "        element.innerHTML = '';\n",
-              "        dataTable['output_type'] = 'display_data';\n",
-              "        await google.colab.output.renderOutput(dataTable, element);\n",
-              "        const docLink = document.createElement('div');\n",
-              "        docLink.innerHTML = docLinkHtml;\n",
-              "        element.appendChild(docLink);\n",
-              "      }\n",
-              "    </script>\n",
-              "  </div>\n",
-              "\n",
-              "\n",
-              "    </div>\n",
-              "  </div>\n"
-            ],
-            "application/vnd.google.colaboratory.intrinsic+json": {
-              "type": "dataframe",
-              "summary": "{\n  \"name\": \"nearby\",\n  \"rows\": 6,\n  \"fields\": [\n    {\n      \"column\": \"uuid_a\",\n      \"properties\": {\n        \"dtype\": \"category\",\n        \"num_unique_values\": 1,\n        \"samples\": [\n          \"d274bd96994a45e4b1260c873c37f68a\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"uuid_b\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 6,\n        \"samples\": [\n          \"fbbc8e9a2dfa4051aa34b04c58b86424\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"date_a\",\n      \"properties\": {\n        \"dtype\": \"object\",\n        \"num_unique_values\": 1,\n        \"samples\": [\n          \"2000-08-23\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"date_b\",\n      \"properties\": {\n        \"dtype\": \"object\",\n        \"num_unique_values\": 2,\n        \"samples\": [\n          \"2000-08-26\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}"
-            }
-          },
-          "metadata": {},
-          "execution_count": 17
-        }
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "id": "6g7yt8huc06",
-      "metadata": {
-        "id": "6g7yt8huc06"
-      },
-      "source": [
-        "Assigning a shared `flood_event` ID to groups of intersecting, temporally-close polygons is a **connected components** problem. If polygon A overlaps B and B overlaps C (both within 7 days), all three should share the same event ID. We build a sparse adjacency graph from the `nearby` pairs and use `scipy.sparse.csgraph.connected_components` to label every cluster. Isolated polygons (no nearby neighbours) each become their own single-member component."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 18,
-      "id": "mqi2zhq5h4",
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "mqi2zhq5h4",
-        "outputId": "8791cecf-50c1-4ba2-bfff-cbb43de84ec6"
-      },
-      "outputs": [
-        {
-          "output_type": "stream",
-          "name": "stdout",
-          "text": [
-            "Total polygons:              445,244\n",
-            "Unique flood events:         154,446\n",
-            "Avg polygons per event:         2.88\n"
-          ]
-        }
-      ],
-      "source": [
-        "from scipy.sparse import coo_matrix\n",
-        "from scipy.sparse.csgraph import connected_components\n",
+        "# Now create a 'nearby' dataframe only for valid pairs\n",
+        "nearby = pd.DataFrame({\n",
+        "    'idx_a': nearby_left,\n",
+        "    'idx_b': nearby_right\n",
+        "})\n",
         "\n",
+        "# Assign a shared `flood_event` id to each connected component: polygons linked\n",
+        "# directly or through a chain of intersecting, temporally-close neighbours\n",
         "n = len(gdf_flood)\n",
         "\n",
         "# Build a symmetric sparse adjacency matrix from the nearby pairs\n",
@@ -1007,6 +773,16 @@
         "print(f'Avg polygons per event:   {n / n_components:>10.2f}')"
       ]
     },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Add the connected-component labels (one flood event ID per polygon) to the main GeoDataFrame."
+      ],
+      "metadata": {
+        "id": "AX_rYadyWMBE"
+      },
+      "id": "AX_rYadyWMBE"
+    },
     {
       "cell_type": "code",
       "execution_count": 19,