|
44 | 44 | }, |
45 | 45 | { |
46 | 46 | "cell_type": "code", |
47 | | - "execution_count": 1, |
| 47 | + "execution_count": 31, |
48 | 48 | "id": "cell-3", |
49 | 49 | "metadata": { |
50 | 50 | "id": "cell-3" |
|
58 | 58 | "from shapely import STRtree\n", |
59 | 59 | "import matplotlib.pyplot as plt\n", |
60 | 60 | "import matplotlib.colors as mcolors\n", |
61 | | - "import numpy as np" |
| 61 | + "import numpy as np\n", |
| 62 | + "from scipy.sparse import coo_matrix\n", |
| 63 | + "from scipy.sparse.csgraph import connected_components\n", |
| 64 | + "from shapely import STRtree" |
62 | 65 | ] |
63 | 66 | }, |
64 | 67 | { |
|
702 | 705 | }, |
703 | 706 | { |
704 | 707 | "cell_type": "code", |
705 | | - "execution_count": 16, |
| 708 | + "execution_count": 27, |
706 | 709 | "id": "e872uw2n33q", |
707 | 710 | "metadata": { |
708 | | - "id": "e872uw2n33q" |
| 711 | + "id": "e872uw2n33q", |
| 712 | + "colab": { |
| 713 | + "base_uri": "https://localhost:8080/" |
| 714 | + }, |
| 715 | + "outputId": "424cf011-6373-4785-fda6-a8a8d9a138ea" |
709 | 716 | }, |
710 | | - "outputs": [], |
| 717 | + "outputs": [ |
| 718 | + { |
| 719 | + "output_type": "stream", |
| 720 | + "name": "stdout", |
| 721 | + "text": [ |
| 722 | + "Filtered to 659,052, which is significantly more memory efficient.\n" |
| 723 | + ] |
| 724 | + } |
| 725 | + ], |
711 | 726 | "source": [ |
712 | | - "# This step is computationally expensive and may take a few minutes\n", |
| 727 | + "# This step is computationally intensive and takes a few minutes.\n", |
713 | 728 | "\n", |
714 | | - "# Work on a clean positional index so iloc lookups are safe\n", |
| 729 | + "# Work on a clean positional index\n", |
715 | 730 | "gdf_flood = gdf_filtered.reset_index(drop=True)\n", |
716 | | - "dates = pd.to_datetime(gdf_flood['start_date']).values # numpy datetime64 array\n", |
| 731 | + "dates = pd.to_datetime(gdf_flood['start_date']).values\n", |
| 732 | + "ndays = 7\n", |
717 | 733 | "\n", |
718 | | - "# Bulk spatial query: returns two arrays (query_geom_index, tree_geom_index)\n", |
719 | | - "# for every intersecting pair across the entire dataset\n", |
| 734 | + "# Use spatial index to find candidate pairs\n", |
720 | 735 | "tree = STRtree(gdf_flood.geometry.values)\n", |
721 | 736 | "left_idx, right_idx = tree.query(gdf_flood.geometry.values, predicate='intersects')\n", |
722 | 737 | "\n", |
723 | | - "# Keep only unique pairs (left < right removes self-matches and duplicates)\n", |
| 738 | + "# Filter out self-intersections and redundant pairs (a,b vs b,a)\n", |
724 | 739 | "mask = left_idx < right_idx\n", |
725 | 740 | "left_idx = left_idx[mask]\n", |
726 | 741 | "right_idx = right_idx[mask]\n", |
727 | 742 | "\n", |
728 | | - "# Compute date difference in days for every spatial pair\n", |
| 743 | + "# Compute date differences using NumPy to avoid creating a large DataFrame\n", |
| 744 | + "# If you run out of memory at this step, you can do this in chunks\n", |
729 | 745 | "date_diff_days = np.abs(\n", |
730 | 746 | " (dates[left_idx] - dates[right_idx]).astype('timedelta64[D]').astype(int)\n", |
731 | 747 | ")\n", |
732 | 748 | "\n", |
733 | | - "# Assemble a dataframe of all spatially intersecting pairs\n", |
734 | | - "pairs = pd.DataFrame({\n", |
735 | | - " 'idx_a': left_idx,\n", |
736 | | - " 'idx_b': right_idx,\n", |
737 | | - " 'uuid_a': gdf_flood['uuid'].iloc[left_idx].values,\n", |
738 | | - " 'uuid_b': gdf_flood['uuid'].iloc[right_idx].values,\n", |
739 | | - " 'date_a': gdf_flood['start_date'].iloc[left_idx].values,\n", |
740 | | - " 'date_b': gdf_flood['start_date'].iloc[right_idx].values,\n", |
741 | | - " 'date_diff_days': date_diff_days,\n", |
742 | | - "})\n", |
743 | | - "\n", |
744 | | - "ndays = 7 # This will be our threshold for \"nearby\" dates in the next step\n", |
| 749 | + "# Filter indices based on the date threshold\n", |
| 750 | + "time_mask = date_diff_days <= ndays\n", |
| 751 | + "nearby_left = left_idx[time_mask]\n", |
| 752 | + "nearby_right = right_idx[time_mask]\n", |
745 | 753 | "\n", |
746 | | - "# Filter to pairs whose dates are within ndays of each other\n", |
747 | | - "nearby = pairs[pairs['date_diff_days'] <= ndays].reset_index(drop=True)" |
748 | | - ] |
749 | | - }, |
750 | | - { |
751 | | - "cell_type": "code", |
752 | | - "source": [ |
753 | | - "print(f'Intersecting polygon pairs with dates within ±{ndays} days: {len(nearby):>10,}')\n", |
754 | | - "nearby.loc[:5, ['uuid_a', 'uuid_b', 'date_a', 'date_b']]" |
755 | | - ], |
756 | | - "metadata": { |
757 | | - "colab": { |
758 | | - "base_uri": "https://localhost:8080/", |
759 | | - "height": 255 |
760 | | - }, |
761 | | - "id": "6cjv4TsaBl_K", |
762 | | - "outputId": "955e2f6b-4c89-4ac2-b463-859191844dc0" |
763 | | - }, |
764 | | - "id": "6cjv4TsaBl_K", |
765 | | - "execution_count": 17, |
766 | | - "outputs": [ |
767 | | - { |
768 | | - "output_type": "stream", |
769 | | - "name": "stdout", |
770 | | - "text": [ |
771 | | - "Intersecting polygon pairs with dates within ±7 days: 659,052\n" |
772 | | - ] |
773 | | - }, |
774 | | - { |
775 | | - "output_type": "execute_result", |
776 | | - "data": { |
777 | | - "text/plain": [ |
778 | | - " uuid_a uuid_b \\\n", |
779 | | - "0 d274bd96994a45e4b1260c873c37f68a fbbc8e9a2dfa4051aa34b04c58b86424 \n", |
780 | | - "1 d274bd96994a45e4b1260c873c37f68a 956e80a7f5c34de3950724210c39cd31 \n", |
781 | | - "2 d274bd96994a45e4b1260c873c37f68a 322a35b0600f45bd9cad635331b479c8 \n", |
782 | | - "3 d274bd96994a45e4b1260c873c37f68a 06dddcc22a5d451cb0215f3412f1a6c6 \n", |
783 | | - "4 d274bd96994a45e4b1260c873c37f68a 61b73731228441a9af8c11d2ff33f979 \n", |
784 | | - "5 d274bd96994a45e4b1260c873c37f68a b41c89ed5ada41a7949e6b03713e4a67 \n", |
785 | | - "\n", |
786 | | - " date_a date_b \n", |
787 | | - "0 2000-08-23 2000-08-23 \n", |
788 | | - "1 2000-08-23 2000-08-23 \n", |
789 | | - "2 2000-08-23 2000-08-26 \n", |
790 | | - "3 2000-08-23 2000-08-26 \n", |
791 | | - "4 2000-08-23 2000-08-23 \n", |
792 | | - "5 2000-08-23 2000-08-26 " |
793 | | - ], |
794 | | - "text/html": [ |
795 | | - "\n", |
796 | | - " <div id=\"df-74352ace-f3fd-4d39-ab9b-a74ad7735047\" class=\"colab-df-container\">\n", |
797 | | - " <div>\n", |
798 | | - "<style scoped>\n", |
799 | | - " .dataframe tbody tr th:only-of-type {\n", |
800 | | - " vertical-align: middle;\n", |
801 | | - " }\n", |
802 | | - "\n", |
803 | | - " .dataframe tbody tr th {\n", |
804 | | - " vertical-align: top;\n", |
805 | | - " }\n", |
806 | | - "\n", |
807 | | - " .dataframe thead th {\n", |
808 | | - " text-align: right;\n", |
809 | | - " }\n", |
810 | | - "</style>\n", |
811 | | - "<table border=\"1\" class=\"dataframe\">\n", |
812 | | - " <thead>\n", |
813 | | - " <tr style=\"text-align: right;\">\n", |
814 | | - " <th></th>\n", |
815 | | - " <th>uuid_a</th>\n", |
816 | | - " <th>uuid_b</th>\n", |
817 | | - " <th>date_a</th>\n", |
818 | | - " <th>date_b</th>\n", |
819 | | - " </tr>\n", |
820 | | - " </thead>\n", |
821 | | - " <tbody>\n", |
822 | | - " <tr>\n", |
823 | | - " <th>0</th>\n", |
824 | | - " <td>d274bd96994a45e4b1260c873c37f68a</td>\n", |
825 | | - " <td>fbbc8e9a2dfa4051aa34b04c58b86424</td>\n", |
826 | | - " <td>2000-08-23</td>\n", |
827 | | - " <td>2000-08-23</td>\n", |
828 | | - " </tr>\n", |
829 | | - " <tr>\n", |
830 | | - " <th>1</th>\n", |
831 | | - " <td>d274bd96994a45e4b1260c873c37f68a</td>\n", |
832 | | - " <td>956e80a7f5c34de3950724210c39cd31</td>\n", |
833 | | - " <td>2000-08-23</td>\n", |
834 | | - " <td>2000-08-23</td>\n", |
835 | | - " </tr>\n", |
836 | | - " <tr>\n", |
837 | | - " <th>2</th>\n", |
838 | | - " <td>d274bd96994a45e4b1260c873c37f68a</td>\n", |
839 | | - " <td>322a35b0600f45bd9cad635331b479c8</td>\n", |
840 | | - " <td>2000-08-23</td>\n", |
841 | | - " <td>2000-08-26</td>\n", |
842 | | - " </tr>\n", |
843 | | - " <tr>\n", |
844 | | - " <th>3</th>\n", |
845 | | - " <td>d274bd96994a45e4b1260c873c37f68a</td>\n", |
846 | | - " <td>06dddcc22a5d451cb0215f3412f1a6c6</td>\n", |
847 | | - " <td>2000-08-23</td>\n", |
848 | | - " <td>2000-08-26</td>\n", |
849 | | - " </tr>\n", |
850 | | - " <tr>\n", |
851 | | - " <th>4</th>\n", |
852 | | - " <td>d274bd96994a45e4b1260c873c37f68a</td>\n", |
853 | | - " <td>61b73731228441a9af8c11d2ff33f979</td>\n", |
854 | | - " <td>2000-08-23</td>\n", |
855 | | - " <td>2000-08-23</td>\n", |
856 | | - " </tr>\n", |
857 | | - " <tr>\n", |
858 | | - " <th>5</th>\n", |
859 | | - " <td>d274bd96994a45e4b1260c873c37f68a</td>\n", |
860 | | - " <td>b41c89ed5ada41a7949e6b03713e4a67</td>\n", |
861 | | - " <td>2000-08-23</td>\n", |
862 | | - " <td>2000-08-26</td>\n", |
863 | | - " </tr>\n", |
864 | | - " </tbody>\n", |
865 | | - "</table>\n", |
866 | | - "</div>\n", |
867 | | - " <div class=\"colab-df-buttons\">\n", |
868 | | - "\n", |
869 | | - " <div class=\"colab-df-container\">\n", |
870 | | - " <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-74352ace-f3fd-4d39-ab9b-a74ad7735047')\"\n", |
871 | | - " title=\"Convert this dataframe to an interactive table.\"\n", |
872 | | - " style=\"display:none;\">\n", |
873 | | - "\n", |
874 | | - " <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n", |
875 | | - " <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n", |
876 | | - " </svg>\n", |
877 | | - " </button>\n", |
878 | | - "\n", |
879 | | - " <style>\n", |
880 | | - " .colab-df-container {\n", |
881 | | - " display:flex;\n", |
882 | | - " gap: 12px;\n", |
883 | | - " }\n", |
884 | | - "\n", |
885 | | - " .colab-df-convert {\n", |
886 | | - " background-color: #E8F0FE;\n", |
887 | | - " border: none;\n", |
888 | | - " border-radius: 50%;\n", |
889 | | - " cursor: pointer;\n", |
890 | | - " display: none;\n", |
891 | | - " fill: #1967D2;\n", |
892 | | - " height: 32px;\n", |
893 | | - " padding: 0 0 0 0;\n", |
894 | | - " width: 32px;\n", |
895 | | - " }\n", |
896 | | - "\n", |
897 | | - " .colab-df-convert:hover {\n", |
898 | | - " background-color: #E2EBFA;\n", |
899 | | - " box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n", |
900 | | - " fill: #174EA6;\n", |
901 | | - " }\n", |
902 | | - "\n", |
903 | | - " .colab-df-buttons div {\n", |
904 | | - " margin-bottom: 4px;\n", |
905 | | - " }\n", |
906 | | - "\n", |
907 | | - " [theme=dark] .colab-df-convert {\n", |
908 | | - " background-color: #3B4455;\n", |
909 | | - " fill: #D2E3FC;\n", |
910 | | - " }\n", |
911 | | - "\n", |
912 | | - " [theme=dark] .colab-df-convert:hover {\n", |
913 | | - " background-color: #434B5C;\n", |
914 | | - " box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n", |
915 | | - " filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n", |
916 | | - " fill: #FFFFFF;\n", |
917 | | - " }\n", |
918 | | - " </style>\n", |
919 | | - "\n", |
920 | | - " <script>\n", |
921 | | - " const buttonEl =\n", |
922 | | - " document.querySelector('#df-74352ace-f3fd-4d39-ab9b-a74ad7735047 button.colab-df-convert');\n", |
923 | | - " buttonEl.style.display =\n", |
924 | | - " google.colab.kernel.accessAllowed ? 'block' : 'none';\n", |
925 | | - "\n", |
926 | | - " async function convertToInteractive(key) {\n", |
927 | | - " const element = document.querySelector('#df-74352ace-f3fd-4d39-ab9b-a74ad7735047');\n", |
928 | | - " const dataTable =\n", |
929 | | - " await google.colab.kernel.invokeFunction('convertToInteractive',\n", |
930 | | - " [key], {});\n", |
931 | | - " if (!dataTable) return;\n", |
932 | | - "\n", |
933 | | - " const docLinkHtml = 'Like what you see? Visit the ' +\n", |
934 | | - " '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n", |
935 | | - " + ' to learn more about interactive tables.';\n", |
936 | | - " element.innerHTML = '';\n", |
937 | | - " dataTable['output_type'] = 'display_data';\n", |
938 | | - " await google.colab.output.renderOutput(dataTable, element);\n", |
939 | | - " const docLink = document.createElement('div');\n", |
940 | | - " docLink.innerHTML = docLinkHtml;\n", |
941 | | - " element.appendChild(docLink);\n", |
942 | | - " }\n", |
943 | | - " </script>\n", |
944 | | - " </div>\n", |
945 | | - "\n", |
946 | | - "\n", |
947 | | - " </div>\n", |
948 | | - " </div>\n" |
949 | | - ], |
950 | | - "application/vnd.google.colaboratory.intrinsic+json": { |
951 | | - "type": "dataframe", |
952 | | - "summary": "{\n \"name\": \"nearby\",\n \"rows\": 6,\n \"fields\": [\n {\n \"column\": \"uuid_a\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"d274bd96994a45e4b1260c873c37f68a\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"uuid_b\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 6,\n \"samples\": [\n \"fbbc8e9a2dfa4051aa34b04c58b86424\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"date_a\",\n \"properties\": {\n \"dtype\": \"object\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"2000-08-23\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"date_b\",\n \"properties\": {\n \"dtype\": \"object\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"2000-08-26\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" |
953 | | - } |
954 | | - }, |
955 | | - "metadata": {}, |
956 | | - "execution_count": 17 |
957 | | - } |
958 | | - ] |
959 | | - }, |
960 | | - { |
961 | | - "cell_type": "markdown", |
962 | | - "id": "6g7yt8huc06", |
963 | | - "metadata": { |
964 | | - "id": "6g7yt8huc06" |
965 | | - }, |
966 | | - "source": [ |
967 | | - "Assigning a shared `flood_event` ID to groups of intersecting, temporally-close polygons is a **connected components** problem. If polygon A overlaps B and B overlaps C (both within 7 days), all three should share the same event ID. We build a sparse adjacency graph from the `nearby` pairs and use `scipy.sparse.csgraph.connected_components` to label every cluster. Isolated polygons (no nearby neighbours) each become their own single-member component." |
968 | | - ] |
969 | | - }, |
970 | | - { |
971 | | - "cell_type": "code", |
972 | | - "execution_count": 18, |
973 | | - "id": "mqi2zhq5h4", |
974 | | - "metadata": { |
975 | | - "colab": { |
976 | | - "base_uri": "https://localhost:8080/" |
977 | | - }, |
978 | | - "id": "mqi2zhq5h4", |
979 | | - "outputId": "8791cecf-50c1-4ba2-bfff-cbb43de84ec6" |
980 | | - }, |
981 | | - "outputs": [ |
982 | | - { |
983 | | - "output_type": "stream", |
984 | | - "name": "stdout", |
985 | | - "text": [ |
986 | | - "Total polygons: 445,244\n", |
987 | | - "Unique flood events: 154,446\n", |
988 | | - "Avg polygons per event: 2.88\n" |
989 | | - ] |
990 | | - } |
991 | | - ], |
992 | | - "source": [ |
993 | | - "from scipy.sparse import coo_matrix\n", |
994 | | - "from scipy.sparse.csgraph import connected_components\n", |
| 754 | + "# Now create a 'nearby' dataframe only for valid pairs\n", |
| 755 | + "nearby = pd.DataFrame({\n", |
| 756 | + " 'idx_a': nearby_left,\n", |
| 757 | + " 'idx_b': nearby_right\n", |
| 758 | + "})\n", |
995 | 759 | "\n", |
| 760 | + "# Now use connected_components to assign a shared `flood_event` id to groups\n", |
| 761 | + "# of intersecting temporally-close polygons\n", |
996 | 762 | "n = len(gdf_flood)\n", |
997 | 763 | "\n", |
998 | 764 | "# Build a symmetric sparse adjacency matrix from the nearby pairs\n", |
|
1007 | 773 | "print(f'Avg polygons per event: {n / n_components:>10.2f}')" |
1008 | 774 | ] |
1009 | 775 | }, |
| 776 | + { |
| 777 | + "cell_type": "markdown", |
| 778 | + "source": [ |
| 779 | + "Add the labels to the main GeoDataFrame." |
| 780 | + ], |
| 781 | + "metadata": { |
| 782 | + "id": "AX_rYadyWMBE" |
| 783 | + }, |
| 784 | + "id": "AX_rYadyWMBE" |
| 785 | + }, |
1010 | 786 | { |
1011 | 787 | "cell_type": "code", |
1012 | 788 | "execution_count": 19, |
|
0 commit comments