diff --git a/docs/sphinx/source/whatsnew/v0.15.2.rst b/docs/sphinx/source/whatsnew/v0.15.2.rst index 327b36c1ae..5d52960900 100644 --- a/docs/sphinx/source/whatsnew/v0.15.2.rst +++ b/docs/sphinx/source/whatsnew/v0.15.2.rst @@ -22,6 +22,10 @@ Bug fixes introduced in v0.15.1 (:pull:`2702`) that caused a broadcasting ``ValueError`` when ``tracker_theta`` was a 2-D (or higher rank) array. (:issue:`2747`, :pull:`2749`) +* :py:func:`pvlib.iotools.read_nsrdb_psm4` now parses the file header with the + :py:mod:`csv` module instead of a naive ``str.split(',')``, so quoted column + names containing commas (e.g. the material names in spectral-on-demand files) + are no longer split into spurious columns. (:issue:`2736`, :pull:`2771`) Enhancements ~~~~~~~~~~~~ @@ -63,6 +67,7 @@ Maintenance Contributors ~~~~~~~~~~~~ * :ghuser:`Omesh37` +* :ghuser:`gaoflow` * Cliff Hansen (:ghuser:`cwhanse`) * :ghuser:`shethkajal7` * Arthur Onno (:ghuser:`ArthurOnnoTerabase`) diff --git a/pvlib/iotools/psm4.py b/pvlib/iotools/psm4.py index 9eb760f382..fc8d098a09 100644 --- a/pvlib/iotools/psm4.py +++ b/pvlib/iotools/psm4.py @@ -6,6 +6,7 @@ https://developer.nlr.gov/docs/solar/nsrdb/nsrdb-GOES-full-disc-v4-0-0-download/ """ +import csv import io from urllib.parse import urljoin import requests @@ -723,11 +724,16 @@ def read_nsrdb_psm4(filename, map_variables=True): `_ """ with tools._file_context_manager(filename) as fbuf: + # The first 3 header lines are parsed with the csv module rather than a + # naive str.split(',') so that quoted fields containing commas are kept + # intact. Spectral-on-demand files, for instance, have column names + # like '"GaAs (Bauhuis et al., 2009)"' whose embedded commas would + # otherwise be split into spurious columns (see GH #2736). # The first 2 lines of the response are headers with metadata - metadata_fields = fbuf.readline().split(',') - metadata_values = fbuf.readline().split(',') + metadata_fields = next(csv.reader([fbuf.readline()])) + metadata_values = next(csv.reader([fbuf.readline()])) # get the column names so we can set the dtypes - columns = fbuf.readline().split(',') + columns = next(csv.reader([fbuf.readline()])) columns[-1] = columns[-1].strip() # strip trailing newline # Since the header has so many columns, excel saves blank cols in the # data below the header lines. diff --git a/tests/iotools/test_psm4.py b/tests/iotools/test_psm4.py index 3b4313b070..c16a714aa3 100644 --- a/tests/iotools/test_psm4.py +++ b/tests/iotools/test_psm4.py @@ -185,6 +185,31 @@ def test_read_nsrdb_psm4_map_variables(): assert_index_equal(data.columns, pd.Index(columns_mapped)) +def test_read_nsrdb_psm4_quoted_columns_with_commas(): + """spectral-on-demand files have quoted column names containing commas; + these must not be split into spurious columns (GH #2736)""" + # Minimal NSRDB file whose column header (3rd line) has quoted material + # names with embedded commas, which is valid CSV. A naive str.split(',') + # would break these into extra columns and raise on read. + content = ( + "Source,Location ID,City,State,Country,Latitude,Longitude,Time Zone," + "Elevation,Local Time Zone,Version\n" + "NSRDB,1,-,-,-,40.0,-105.0,-7,1600,-7,4.0.1\n" + 'Year,Month,Day,Hour,Minute,GHI,"GaAs (Bauhuis et al., 2009)",' + '"InGaP (Gray, 2008)"\n' + "2023,1,1,0,0,0,0.1,0.2\n" + "2023,1,1,1,0,5,0.3,0.4\n" + ) + data, metadata = psm4.read_nsrdb_psm4(StringIO(content), + map_variables=False) + assert list(data.columns) == [ + 'Year', 'Month', 'Day', 'Hour', 'Minute', 'GHI', + 'GaAs (Bauhuis et al., 2009)', 'InGaP (Gray, 2008)'] + assert data.shape == (2, 8) + # the embedded-comma data columns round-trip as floats + assert data['GaAs (Bauhuis et al., 2009)'].tolist() == [0.1, 0.3] + + @pytest.mark.remote_data @pytest.mark.flaky(reruns=RERUNS, reruns_delay=RERUNS_DELAY) def test_get_nsrdb_psm4_aggregated_parameter_mapping(nlr_api_key):