Skip to content

Commit ca9a61d

Browse files
committed
Update download urls from figshare to s3 bucket
1 parent 493ed25 commit ca9a61d

6 files changed

Lines changed: 366 additions & 63 deletions

File tree

cebra/data/assets.py

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
# limitations under the License.
2121
#
2222

23+
import gzip
2324
import hashlib
2425
import re
2526
import warnings
@@ -140,3 +141,128 @@ def calculate_checksum(file_path: str) -> str:
140141
for chunk in iter(lambda: file.read(4096), b""):
141142
checksum.update(chunk)
142143
return checksum.hexdigest()
144+
145+
146+
def download_and_extract_gzipped_file(url: str,
                                      expected_checksum: str,
                                      gzipped_checksum: str,
                                      location: str,
                                      file_name: str,
                                      retry_count: int = 0) -> Optional[Path]:
    """Download a gzipped file from the given URL, verify checksums, and extract.

    The archive is streamed to ``<location>/<file_name>.gz`` while its MD5
    checksum is accumulated, the gzipped checksum is verified, the archive is
    decompressed to ``<location>/<file_name>``, and the checksum of the
    extracted file is verified as well. On any checksum or extraction failure
    the partial files are deleted and the whole procedure is retried, up to
    ``_MAX_RETRY_COUNT`` attempts.

    Args:
        url: The URL to download the gzipped file from.
        expected_checksum: The expected MD5 checksum of the extracted file.
        gzipped_checksum: The expected MD5 checksum of the gzipped file.
        location: The directory where the file will be saved.
        file_name: The name of the final extracted file (without .gz extension).
        retry_count: The number of retry attempts (default: 0).

    Returns:
        The path of the extracted file if successful, None otherwise.

    Raises:
        RuntimeError: If the maximum retry count is exceeded.
        requests.HTTPError: If the download fails.
    """
    # Short-circuit: the extracted file already exists with the right checksum.
    location_path = Path(location)
    final_file_path = location_path / file_name

    if final_file_path.exists():
        if calculate_checksum(final_file_path) == expected_checksum:
            return final_file_path

    if retry_count >= _MAX_RETRY_COUNT:
        raise RuntimeError(
            f"Exceeded maximum retry count ({_MAX_RETRY_COUNT}). "
            f"Unable to download the file from {url}")

    def _retry() -> Optional[Path]:
        # Single definition of the retry recursion shared by all failure paths.
        return download_and_extract_gzipped_file(url, expected_checksum,
                                                 gzipped_checksum, location,
                                                 file_name, retry_count + 1)

    # Create the directory and any necessary parent directories.
    location_path.mkdir(parents=True, exist_ok=True)

    gz_file_path = location_path / f"{file_name}.gz"

    # ``with`` releases the streamed connection even on error; the timeout
    # guards against a hung connection (with stream=True it applies per read).
    with requests.get(url, stream=True, timeout=60) as response:
        if response.status_code != 200:
            raise requests.HTTPError(
                f"Error occurred while downloading the file. Response code: {response.status_code}"
            )

        total_size = int(response.headers.get("Content-Length", 0))
        checksum = hashlib.md5()  # checksum of the *gzipped* payload

        with open(gz_file_path, "wb") as file:
            with tqdm.tqdm(total=total_size,
                           unit="B",
                           unit_scale=True,
                           desc="Downloading") as progress_bar:
                for data in response.iter_content(chunk_size=1024):
                    file.write(data)
                    checksum.update(data)
                    progress_bar.update(len(data))

    # Verify the gzipped file before spending time on extraction.
    if checksum.hexdigest() != gzipped_checksum:
        warnings.warn(
            f"Gzipped file checksum verification failed. Deleting '{gz_file_path}'."
        )
        gz_file_path.unlink()
        warnings.warn("Gzipped file deleted. Retrying download...")
        return _retry()

    print("Gzipped file checksum verified. Extracting...")

    try:
        with gzip.open(gz_file_path, 'rb') as f_in:
            with open(final_file_path, 'wb') as f_out:
                # Decompress in fixed-size chunks to bound memory usage; same
                # chunked-read idiom as calculate_checksum above.
                for chunk in iter(lambda: f_in.read(8192), b""):
                    f_out.write(chunk)
    except Exception as e:
        warnings.warn(f"Extraction failed: {e}. Deleting files and retrying...")
        if gz_file_path.exists():
            gz_file_path.unlink()
        if final_file_path.exists():
            final_file_path.unlink()
        return _retry()

    # Verify the checksum of the *extracted* file.
    if calculate_checksum(final_file_path) != expected_checksum:
        warnings.warn(
            "Extracted file checksum verification failed. Deleting files.")
        gz_file_path.unlink()
        final_file_path.unlink()
        warnings.warn("Files deleted. Retrying download...")
        return _retry()

    # The archive is no longer needed once extraction succeeded.
    gz_file_path.unlink()

    print(f"Extraction complete. Dataset saved in '{final_file_path}'")
    return final_file_path

cebra/data/base.py

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ def __init__(self,
5555
download=False,
5656
data_url=None,
5757
data_checksum=None,
58+
gzipped_checksum=None,
5859
location=None,
5960
file_name=None):
6061

@@ -64,6 +65,7 @@ def __init__(self,
6465
self.download = download
6566
self.data_url = data_url
6667
self.data_checksum = data_checksum
68+
self.gzipped_checksum = gzipped_checksum
6769
self.location = location
6870
self.file_name = file_name
6971

@@ -78,11 +80,21 @@ def __init__(self,
7880
"Missing data checksum. Please provide the checksum to verify the data integrity."
7981
)
8082

81-
cebra_data_assets.download_file_with_progress_bar(
82-
url=self.data_url,
83-
expected_checksum=self.data_checksum,
84-
location=self.location,
85-
file_name=self.file_name)
83+
# Use gzipped download if gzipped_checksum is provided
84+
if self.gzipped_checksum is not None:
85+
cebra_data_assets.download_and_extract_gzipped_file(
86+
url=self.data_url,
87+
expected_checksum=self.data_checksum,
88+
gzipped_checksum=self.gzipped_checksum,
89+
location=self.location,
90+
file_name=self.file_name)
91+
else:
92+
# Fall back to legacy download for backward compatibility
93+
cebra_data_assets.download_file_with_progress_bar(
94+
url=self.data_url,
95+
expected_checksum=self.data_checksum,
96+
location=self.location,
97+
file_name=self.file_name)
8698

8799
@property
88100
@abc.abstractmethod

cebra/datasets/hippocampus.py

Lines changed: 23 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -50,27 +50,35 @@
5050
# Download locations for the preprocessed rat hippocampus recordings, served
# as gzipped joblib files from the CEBRA S3-compatible bucket. For each rat,
# "checksum" is the MD5 of the extracted .jl file and "gzipped_checksum" is
# the MD5 of the .gz archive as downloaded.
_RAT_DATA_BASE_URL = "https://cebra.fra1.digitaloceanspaces.com/data/rat_hippocampus"

rat_dataset_urls = {
    name: {
        "url": f"{_RAT_DATA_BASE_URL}/{name}.jl.gz",
        "checksum": checksum,
        "gzipped_checksum": gzipped_checksum,
    } for name, checksum, gzipped_checksum in [
        ("achilles", "c52f9b55cbc23c66d57f3842214058b8",
         "5d7b243e07b24c387e5412cd5ff46f0b"),
        ("buddy", "36341322907708c466871bf04bc133c2",
         "339290585be2188f48a176f05aaf5df6"),
        ("cicero", "a83b02dbdc884fdd7e53df362499d42f",
         "f262a87d2e59f164cb404cd410015f3a"),
        ("gatsby", "2b889da48178b3155011c12555342813",
         "564e431c19e55db2286a9d64c86a94c4"),
    ]
}
7684

@@ -95,11 +103,13 @@ def __init__(self, name="achilles", root=_DEFAULT_DATADIR, download=True):
95103
location = pathlib.Path(root) / "rat_hippocampus"
96104
file_path = location / f"{name}.jl"
97105

98-
super().__init__(download=download,
99-
data_url=rat_dataset_urls[name]["url"],
100-
data_checksum=rat_dataset_urls[name]["checksum"],
101-
location=location,
102-
file_name=f"{name}.jl")
106+
super().__init__(
107+
download=download,
108+
data_url=rat_dataset_urls[name]["url"],
109+
data_checksum=rat_dataset_urls[name]["checksum"],
110+
gzipped_checksum=rat_dataset_urls[name].get("gzipped_checksum"),
111+
location=location,
112+
file_name=f"{name}.jl")
103113

104114
data = joblib.load(file_path)
105115
self.neural = torch.from_numpy(data["spikes"]).float()

cebra/datasets/monkey_reaching.py

Lines changed: 52 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -160,75 +160,99 @@ def _get_info(trial_info, data):
160160
# Download locations for the preprocessed monkey reaching sessions, served as
# gzipped joblib files from the CEBRA S3-compatible bucket. Keys are the
# extracted file names; "checksum" is the MD5 of the extracted .jl file and
# "gzipped_checksum" is the MD5 of the downloaded .gz archive.
_MONKEY_DATA_BASE_URL = ("https://cebra.fra1.digitaloceanspaces.com/data/"
                         "monkey_reaching_preload_smth_40")

monkey_reaching_urls = {
    file_name: {
        "url": f"{_MONKEY_DATA_BASE_URL}/{file_name}.gz",
        "checksum": checksum,
        "gzipped_checksum": gzipped_checksum,
    } for file_name, checksum, gzipped_checksum in [
        ("all_all.jl", "dea556301fa4fafa86e28cf8621cab5a",
         "399abc6e9ef0b23a0d6d057c6f508939"),
        ("all_train.jl", "e280e4cd86969e6fd8bfd3a8f402b2fe",
         "eb52c8641fe83ae2a278b372ddec5f69"),
        ("all_test.jl", "25d3ff2c15014db8b8bf2543482ae881",
         "7688245cf15e0b92503af943ce9f66aa"),
        ("all_valid.jl", "8cd25169d31f83ae01b03f7b1b939723",
         "b169fc008b4d092fe2a1b7e006cd17a7"),
        ("active_all.jl", "c626acea5062122f5a68ef18d3e45e51",
         "b7b86e2ae00bb71341de8fc352dae097"),
        ("active_train.jl", "72a48056691078eee22c36c1992b1d37",
         "56687c633efcbff6c56bbcfa35597565"),
        ("active_test.jl", "35b7e060008a8722c536584c4748f2ea",
         "2057ef1846908a69486a61895d1198e8"),
        ("active_valid.jl", "dd58eb1e589361b4132f34b22af56b79",
         "60b8e418f234877351fe36f1efc169ad"),
        ("passive_all.jl", "bbb1bc9d8eec583a46f6673470fc98ad",
         "afb257efa0cac3ccd69ec80478d63691"),
        ("passive_train.jl", "f22e05a69f70e18ba823a0a89162a45c",
         "24d98d7d41a52591f838c41fe83dc2c6"),
        ("passive_test.jl", "42453ae3e4fd27d82d297f78c13cd6b7",
         "f1ff4e9b7c4a0d7fa9dcd271893f57ab"),
        ("passive_valid.jl", "2dcc10c27631b95a075eaa2d2297bb4a",
         "311fcb6a3e86022f12d78828f7bd29d5"),
    ]
}
234258

@@ -270,6 +294,8 @@ def __init__(self,
270294
data_url=monkey_reaching_urls[f"{self.load_session}_all.jl"]["url"],
271295
data_checksum=monkey_reaching_urls[f"{self.load_session}_all.jl"]
272296
["checksum"],
297+
gzipped_checksum=monkey_reaching_urls[f"{self.load_session}_all.jl"]
298+
.get("gzipped_checksum"),
273299
location=self.path,
274300
file_name=f"{self.load_session}_all.jl",
275301
)
@@ -297,6 +323,8 @@ def split(self, split):
297323
["url"],
298324
data_checksum=monkey_reaching_urls[
299325
f"{self.load_session}_{split}.jl"]["checksum"],
326+
gzipped_checksum=monkey_reaching_urls[
327+
f"{self.load_session}_{split}.jl"].get("gzipped_checksum"),
300328
location=self.path,
301329
file_name=f"{self.load_session}_{split}.jl",
302330
)

0 commit comments

Comments (0)