Skip to content

Commit b76b5cc

Browse files
update
1 parent 666e520 commit b76b5cc

1 file changed

Lines changed: 49 additions & 273 deletions

File tree

notebooks/geopandas_flood_frequency.ipynb

Lines changed: 49 additions & 273 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@
4444
},
4545
{
4646
"cell_type": "code",
47-
"execution_count": 1,
47+
"execution_count": 31,
4848
"id": "cell-3",
4949
"metadata": {
5050
"id": "cell-3"
@@ -58,7 +58,10 @@
5858
"from shapely import STRtree\n",
5959
"import matplotlib.pyplot as plt\n",
6060
"import matplotlib.colors as mcolors\n",
61-
"import numpy as np"
61+
"import numpy as np\n",
62+
"from scipy.sparse import coo_matrix\n",
63+
"from scipy.sparse.csgraph import connected_components\n",
64+
"from shapely import STRtree"
6265
]
6366
},
6467
{
@@ -702,297 +705,60 @@
702705
},
703706
{
704707
"cell_type": "code",
705-
"execution_count": 16,
708+
"execution_count": 27,
706709
"id": "e872uw2n33q",
707710
"metadata": {
708-
"id": "e872uw2n33q"
711+
"id": "e872uw2n33q",
712+
"colab": {
713+
"base_uri": "https://localhost:8080/"
714+
},
715+
"outputId": "424cf011-6373-4785-fda6-a8a8d9a138ea"
709716
},
710-
"outputs": [],
717+
"outputs": [
718+
{
719+
"output_type": "stream",
720+
"name": "stdout",
721+
"text": [
722+
"Filtered to 659,052, which is significantly more memory efficient.\n"
723+
]
724+
}
725+
],
711726
"source": [
712-
"# This step is computationally expensive and may take a few minutes\n",
727+
"# This step is computationally intensive and takes a few minutes.\n",
713728
"\n",
714-
"# Work on a clean positional index so iloc lookups are safe\n",
729+
"# Work on a clean positional index\n",
715730
"gdf_flood = gdf_filtered.reset_index(drop=True)\n",
716-
"dates = pd.to_datetime(gdf_flood['start_date']).values # numpy datetime64 array\n",
731+
"dates = pd.to_datetime(gdf_flood['start_date']).values\n",
732+
"ndays = 7\n",
717733
"\n",
718-
"# Bulk spatial query: returns two arrays (query_geom_index, tree_geom_index)\n",
719-
"# for every intersecting pair across the entire dataset\n",
734+
"# Use spatial index to find candidate pairs\n",
720735
"tree = STRtree(gdf_flood.geometry.values)\n",
721736
"left_idx, right_idx = tree.query(gdf_flood.geometry.values, predicate='intersects')\n",
722737
"\n",
723-
"# Keep only unique pairs (left < right removes self-matches and duplicates)\n",
738+
"# Filter out self-intersections and redundant pairs (a,b vs b,a)\n",
724739
"mask = left_idx < right_idx\n",
725740
"left_idx = left_idx[mask]\n",
726741
"right_idx = right_idx[mask]\n",
727742
"\n",
728-
"# Compute date difference in days for every spatial pair\n",
743+
"# Compute date differences using NumPy to avoid creating a large DataFrame\n",
744+
"# If you run out of memory at this step, you can do this in chunks\n",
729745
"date_diff_days = np.abs(\n",
730746
" (dates[left_idx] - dates[right_idx]).astype('timedelta64[D]').astype(int)\n",
731747
")\n",
732748
"\n",
733-
"# Assemble a dataframe of all spatially intersecting pairs\n",
734-
"pairs = pd.DataFrame({\n",
735-
" 'idx_a': left_idx,\n",
736-
" 'idx_b': right_idx,\n",
737-
" 'uuid_a': gdf_flood['uuid'].iloc[left_idx].values,\n",
738-
" 'uuid_b': gdf_flood['uuid'].iloc[right_idx].values,\n",
739-
" 'date_a': gdf_flood['start_date'].iloc[left_idx].values,\n",
740-
" 'date_b': gdf_flood['start_date'].iloc[right_idx].values,\n",
741-
" 'date_diff_days': date_diff_days,\n",
742-
"})\n",
743-
"\n",
744-
"ndays = 7 # This will be our threshold for \"nearby\" dates in the next step\n",
749+
"# Filter indices based on the date threshold\n",
750+
"time_mask = date_diff_days <= ndays\n",
751+
"nearby_left = left_idx[time_mask]\n",
752+
"nearby_right = right_idx[time_mask]\n",
745753
"\n",
746-
"# Filter to pairs whose dates are within ndays of each other\n",
747-
"nearby = pairs[pairs['date_diff_days'] <= ndays].reset_index(drop=True)"
748-
]
749-
},
750-
{
751-
"cell_type": "code",
752-
"source": [
753-
"print(f'Intersecting polygon pairs with dates within ±{ndays} days: {len(nearby):>10,}')\n",
754-
"nearby.loc[:5, ['uuid_a', 'uuid_b', 'date_a', 'date_b']]"
755-
],
756-
"metadata": {
757-
"colab": {
758-
"base_uri": "https://localhost:8080/",
759-
"height": 255
760-
},
761-
"id": "6cjv4TsaBl_K",
762-
"outputId": "955e2f6b-4c89-4ac2-b463-859191844dc0"
763-
},
764-
"id": "6cjv4TsaBl_K",
765-
"execution_count": 17,
766-
"outputs": [
767-
{
768-
"output_type": "stream",
769-
"name": "stdout",
770-
"text": [
771-
"Intersecting polygon pairs with dates within ±7 days: 659,052\n"
772-
]
773-
},
774-
{
775-
"output_type": "execute_result",
776-
"data": {
777-
"text/plain": [
778-
" uuid_a uuid_b \\\n",
779-
"0 d274bd96994a45e4b1260c873c37f68a fbbc8e9a2dfa4051aa34b04c58b86424 \n",
780-
"1 d274bd96994a45e4b1260c873c37f68a 956e80a7f5c34de3950724210c39cd31 \n",
781-
"2 d274bd96994a45e4b1260c873c37f68a 322a35b0600f45bd9cad635331b479c8 \n",
782-
"3 d274bd96994a45e4b1260c873c37f68a 06dddcc22a5d451cb0215f3412f1a6c6 \n",
783-
"4 d274bd96994a45e4b1260c873c37f68a 61b73731228441a9af8c11d2ff33f979 \n",
784-
"5 d274bd96994a45e4b1260c873c37f68a b41c89ed5ada41a7949e6b03713e4a67 \n",
785-
"\n",
786-
" date_a date_b \n",
787-
"0 2000-08-23 2000-08-23 \n",
788-
"1 2000-08-23 2000-08-23 \n",
789-
"2 2000-08-23 2000-08-26 \n",
790-
"3 2000-08-23 2000-08-26 \n",
791-
"4 2000-08-23 2000-08-23 \n",
792-
"5 2000-08-23 2000-08-26 "
793-
],
794-
"text/html": [
795-
"\n",
796-
" <div id=\"df-74352ace-f3fd-4d39-ab9b-a74ad7735047\" class=\"colab-df-container\">\n",
797-
" <div>\n",
798-
"<style scoped>\n",
799-
" .dataframe tbody tr th:only-of-type {\n",
800-
" vertical-align: middle;\n",
801-
" }\n",
802-
"\n",
803-
" .dataframe tbody tr th {\n",
804-
" vertical-align: top;\n",
805-
" }\n",
806-
"\n",
807-
" .dataframe thead th {\n",
808-
" text-align: right;\n",
809-
" }\n",
810-
"</style>\n",
811-
"<table border=\"1\" class=\"dataframe\">\n",
812-
" <thead>\n",
813-
" <tr style=\"text-align: right;\">\n",
814-
" <th></th>\n",
815-
" <th>uuid_a</th>\n",
816-
" <th>uuid_b</th>\n",
817-
" <th>date_a</th>\n",
818-
" <th>date_b</th>\n",
819-
" </tr>\n",
820-
" </thead>\n",
821-
" <tbody>\n",
822-
" <tr>\n",
823-
" <th>0</th>\n",
824-
" <td>d274bd96994a45e4b1260c873c37f68a</td>\n",
825-
" <td>fbbc8e9a2dfa4051aa34b04c58b86424</td>\n",
826-
" <td>2000-08-23</td>\n",
827-
" <td>2000-08-23</td>\n",
828-
" </tr>\n",
829-
" <tr>\n",
830-
" <th>1</th>\n",
831-
" <td>d274bd96994a45e4b1260c873c37f68a</td>\n",
832-
" <td>956e80a7f5c34de3950724210c39cd31</td>\n",
833-
" <td>2000-08-23</td>\n",
834-
" <td>2000-08-23</td>\n",
835-
" </tr>\n",
836-
" <tr>\n",
837-
" <th>2</th>\n",
838-
" <td>d274bd96994a45e4b1260c873c37f68a</td>\n",
839-
" <td>322a35b0600f45bd9cad635331b479c8</td>\n",
840-
" <td>2000-08-23</td>\n",
841-
" <td>2000-08-26</td>\n",
842-
" </tr>\n",
843-
" <tr>\n",
844-
" <th>3</th>\n",
845-
" <td>d274bd96994a45e4b1260c873c37f68a</td>\n",
846-
" <td>06dddcc22a5d451cb0215f3412f1a6c6</td>\n",
847-
" <td>2000-08-23</td>\n",
848-
" <td>2000-08-26</td>\n",
849-
" </tr>\n",
850-
" <tr>\n",
851-
" <th>4</th>\n",
852-
" <td>d274bd96994a45e4b1260c873c37f68a</td>\n",
853-
" <td>61b73731228441a9af8c11d2ff33f979</td>\n",
854-
" <td>2000-08-23</td>\n",
855-
" <td>2000-08-23</td>\n",
856-
" </tr>\n",
857-
" <tr>\n",
858-
" <th>5</th>\n",
859-
" <td>d274bd96994a45e4b1260c873c37f68a</td>\n",
860-
" <td>b41c89ed5ada41a7949e6b03713e4a67</td>\n",
861-
" <td>2000-08-23</td>\n",
862-
" <td>2000-08-26</td>\n",
863-
" </tr>\n",
864-
" </tbody>\n",
865-
"</table>\n",
866-
"</div>\n",
867-
" <div class=\"colab-df-buttons\">\n",
868-
"\n",
869-
" <div class=\"colab-df-container\">\n",
870-
" <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-74352ace-f3fd-4d39-ab9b-a74ad7735047')\"\n",
871-
" title=\"Convert this dataframe to an interactive table.\"\n",
872-
" style=\"display:none;\">\n",
873-
"\n",
874-
" <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
875-
" <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
876-
" </svg>\n",
877-
" </button>\n",
878-
"\n",
879-
" <style>\n",
880-
" .colab-df-container {\n",
881-
" display:flex;\n",
882-
" gap: 12px;\n",
883-
" }\n",
884-
"\n",
885-
" .colab-df-convert {\n",
886-
" background-color: #E8F0FE;\n",
887-
" border: none;\n",
888-
" border-radius: 50%;\n",
889-
" cursor: pointer;\n",
890-
" display: none;\n",
891-
" fill: #1967D2;\n",
892-
" height: 32px;\n",
893-
" padding: 0 0 0 0;\n",
894-
" width: 32px;\n",
895-
" }\n",
896-
"\n",
897-
" .colab-df-convert:hover {\n",
898-
" background-color: #E2EBFA;\n",
899-
" box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
900-
" fill: #174EA6;\n",
901-
" }\n",
902-
"\n",
903-
" .colab-df-buttons div {\n",
904-
" margin-bottom: 4px;\n",
905-
" }\n",
906-
"\n",
907-
" [theme=dark] .colab-df-convert {\n",
908-
" background-color: #3B4455;\n",
909-
" fill: #D2E3FC;\n",
910-
" }\n",
911-
"\n",
912-
" [theme=dark] .colab-df-convert:hover {\n",
913-
" background-color: #434B5C;\n",
914-
" box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
915-
" filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
916-
" fill: #FFFFFF;\n",
917-
" }\n",
918-
" </style>\n",
919-
"\n",
920-
" <script>\n",
921-
" const buttonEl =\n",
922-
" document.querySelector('#df-74352ace-f3fd-4d39-ab9b-a74ad7735047 button.colab-df-convert');\n",
923-
" buttonEl.style.display =\n",
924-
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
925-
"\n",
926-
" async function convertToInteractive(key) {\n",
927-
" const element = document.querySelector('#df-74352ace-f3fd-4d39-ab9b-a74ad7735047');\n",
928-
" const dataTable =\n",
929-
" await google.colab.kernel.invokeFunction('convertToInteractive',\n",
930-
" [key], {});\n",
931-
" if (!dataTable) return;\n",
932-
"\n",
933-
" const docLinkHtml = 'Like what you see? Visit the ' +\n",
934-
" '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
935-
" + ' to learn more about interactive tables.';\n",
936-
" element.innerHTML = '';\n",
937-
" dataTable['output_type'] = 'display_data';\n",
938-
" await google.colab.output.renderOutput(dataTable, element);\n",
939-
" const docLink = document.createElement('div');\n",
940-
" docLink.innerHTML = docLinkHtml;\n",
941-
" element.appendChild(docLink);\n",
942-
" }\n",
943-
" </script>\n",
944-
" </div>\n",
945-
"\n",
946-
"\n",
947-
" </div>\n",
948-
" </div>\n"
949-
],
950-
"application/vnd.google.colaboratory.intrinsic+json": {
951-
"type": "dataframe",
952-
"summary": "{\n \"name\": \"nearby\",\n \"rows\": 6,\n \"fields\": [\n {\n \"column\": \"uuid_a\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"d274bd96994a45e4b1260c873c37f68a\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"uuid_b\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 6,\n \"samples\": [\n \"fbbc8e9a2dfa4051aa34b04c58b86424\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"date_a\",\n \"properties\": {\n \"dtype\": \"object\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"2000-08-23\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"date_b\",\n \"properties\": {\n \"dtype\": \"object\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"2000-08-26\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
953-
}
954-
},
955-
"metadata": {},
956-
"execution_count": 17
957-
}
958-
]
959-
},
960-
{
961-
"cell_type": "markdown",
962-
"id": "6g7yt8huc06",
963-
"metadata": {
964-
"id": "6g7yt8huc06"
965-
},
966-
"source": [
967-
"Assigning a shared `flood_event` ID to groups of intersecting, temporally-close polygons is a **connected components** problem. If polygon A overlaps B and B overlaps C (both within 7 days), all three should share the same event ID. We build a sparse adjacency graph from the `nearby` pairs and use `scipy.sparse.csgraph.connected_components` to label every cluster. Isolated polygons (no nearby neighbours) each become their own single-member component."
968-
]
969-
},
970-
{
971-
"cell_type": "code",
972-
"execution_count": 18,
973-
"id": "mqi2zhq5h4",
974-
"metadata": {
975-
"colab": {
976-
"base_uri": "https://localhost:8080/"
977-
},
978-
"id": "mqi2zhq5h4",
979-
"outputId": "8791cecf-50c1-4ba2-bfff-cbb43de84ec6"
980-
},
981-
"outputs": [
982-
{
983-
"output_type": "stream",
984-
"name": "stdout",
985-
"text": [
986-
"Total polygons: 445,244\n",
987-
"Unique flood events: 154,446\n",
988-
"Avg polygons per event: 2.88\n"
989-
]
990-
}
991-
],
992-
"source": [
993-
"from scipy.sparse import coo_matrix\n",
994-
"from scipy.sparse.csgraph import connected_components\n",
754+
"# Now create a 'nearby' dataframe only for valid pairs\n",
755+
"nearby = pd.DataFrame({\n",
756+
" 'idx_a': nearby_left,\n",
757+
" 'idx_b': nearby_right\n",
758+
"})\n",
995759
"\n",
760+
"# Now use connected_components to assign a shared `flood_event` id to groups\n",
761+
"# of intersecting temporally-close polygons\n",
996762
"n = len(gdf_flood)\n",
997763
"\n",
998764
"# Build a symmetric sparse adjacency matrix from the nearby pairs\n",
@@ -1007,6 +773,16 @@
1007773
"print(f'Avg polygons per event: {n / n_components:>10.2f}')"
1008774
]
1009775
},
776+
{
777+
"cell_type": "markdown",
778+
"source": [
779+
"Add the labels to the main GeoDataFrame."
780+
],
781+
"metadata": {
782+
"id": "AX_rYadyWMBE"
783+
},
784+
"id": "AX_rYadyWMBE"
785+
},
1010786
{
1011787
"cell_type": "code",
1012788
"execution_count": 19,

0 commit comments

Comments
 (0)