Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
64a6e40
:bug: Refactor data loaders to be lazy and use generators to prevent …
garlontas Jun 6, 2025
7aff681
fix: address all loader PR review comments
Copilot Apr 12, 2026
150cd58
Merge pull request #121 from pickwicksoft/copilot/modify-all-loaders
garlontas Apr 12, 2026
3032d8d
refactor: handle StopIteration exceptions and enhance documentation
deepsource-autofix[bot] Apr 12, 2026
77e2cb0
♻ Fix problems caused by outdated CI/CD, deps and code quality
garlontas Apr 12, 2026
073ac1c
refactor: consolidate one-line docstrings
deepsource-autofix[bot] Apr 12, 2026
5153c28
♻ Fix bugs in test and codestyle breaking CI
garlontas Apr 12, 2026
3d851f9
:green_heart: Update GitHub Actions to use latest versions of checkou…
garlontas Apr 12, 2026
04df098
fix: update action versions in workflows and improve XML parsing docu…
garlontas Apr 12, 2026
158eea5
fix: downgrade cache action version in workflows for compatibility
garlontas Apr 12, 2026
3e52251
Remove duplicate code in loader tests via shared LoaderTestBase
Copilot Apr 13, 2026
9943b9f
Fix LoaderTestBase warning: use mixin pattern instead of inheriting T…
Copilot Apr 13, 2026
0048db0
Fix pylint E1129: replace parenthesized with-tuple with nested with s…
Copilot Apr 13, 2026
2364d55
Refactor file mock context manager for readability
garlontas Apr 13, 2026
fe05cc8
Remove Python versions 3.8 and 3.9 from workflow
garlontas Apr 13, 2026
fbbe279
Merge pull request #122 from pickwicksoft/copilot/remove-duplicate-co…
garlontas Apr 13, 2026
c0710d7
Refactor CSV loader tests for improved readability and structure
garlontas Apr 13, 2026
4640bfb
Improve loader tests to handle various scenarios
garlontas Apr 13, 2026
8575eeb
lint: Add final newline
garlontas Apr 13, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,7 @@ jobs:
run: pip install tox
- name: Run tox
run: tox -e py
- name: SonarCloud Scan
uses: SonarSource/sonarcloud-github-action@master
- name: SonarQube Scan
uses: SonarSource/sonarqube-scan-action@299e4b793aaa83bf2aba7c9c14bedbb485688ec4
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Needed to get PR information, if any
SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }}
24 changes: 12 additions & 12 deletions .github/workflows/pylint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,17 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ]
python-version: [ "3.10", "3.11", "3.12" ]
steps:
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install pylint
- name: Analysing the code with pylint
run: |
pylint $(git ls-files '*.py')
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install pylint
- name: Analysing the code with pylint
run: |
pylint $(git ls-files '*.py')
8 changes: 4 additions & 4 deletions .github/workflows/unittests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,13 @@ jobs:
test:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v6

# If you wanted to use multiple Python versions, you'd have to specify a matrix in the job and
# reference the matrix's python version here.
- uses: actions/setup-python@v5
- uses: actions/setup-python@v6
with:
python-version: 3.9
python-version: '3.10'

# Cache the installation of Poetry itself, e.g. the next step. This prevents the workflow
# from installing Poetry every time, which can be slow. Note the use of the Poetry version
Expand All @@ -45,7 +45,7 @@ jobs:
# The key configuration value here is `virtualenvs-in-project: true`: this creates the
# venv as a `.venv` in your testing directory, which allows the next step to easily
# cache it.
- uses: snok/install-poetry@v1
- uses: snok/install-poetry@76e04a911780d5b312d89783f7b1cd627778900a
with:
version: 2.1.0
virtualenvs-create: true
Expand Down
1,536 changes: 927 additions & 609 deletions poetry.lock

Large diffs are not rendered by default.

65 changes: 43 additions & 22 deletions pystreamapi/loaders/__csv/__csv_loader.py
Original file line number Diff line number Diff line change
@@ -1,41 +1,62 @@
from collections import namedtuple
from csv import reader
from io import StringIO
from typing import Any, Iterator

from pystreamapi.loaders.__loader_utils import LoaderUtils
from pystreamapi.loaders.__lazy_file_iterable import LazyFileIterable


def csv(file_path: str, cast_types=True, delimiter=',', encoding="utf-8") -> LazyFileIterable:
def csv(
src: str, read_from_src=False, cast_types=True, delimiter=',', encoding="utf-8"
) -> Iterator[Any]:
"""
Loads a CSV file and converts it into a list of namedtuples.

Returns:
list: A list of namedtuples, where each namedtuple represents a row in the CSV.
:param cast_types: Set as False to disable casting of values to int, bool or float.
:param encoding: The encoding of the CSV file.
:param file_path: The path to the CSV file.
:param delimiter: The delimiter used in the CSV file.
Lazily loads CSV data from either a path or a string and yields namedtuples.

Args:
src (str): Either the path to a CSV file or a CSV string.
read_from_src (bool): If True, src is treated as a CSV string.
If False, src is treated as a path to a CSV file.
cast_types (bool): Set as False to disable casting of values to int, bool or float.
delimiter (str): The delimiter used in the CSV data.
encoding (str): The encoding of the CSV file (only used when reading from file).

Yields:
namedtuple: Each row in the CSV as a namedtuple.
"""
file_path = LoaderUtils.validate_path(file_path)
return LazyFileIterable(lambda: __load_csv(file_path, cast_types, delimiter, encoding))
if not read_from_src:
src = LoaderUtils.validate_path(src)
return __load_csv_from_file(src, cast_types, delimiter, encoding)
return __load_csv_from_string(src, cast_types, delimiter)


def __load_csv(file_path, cast, delimiter, encoding):
"""Load a CSV file and convert it into a list of namedtuples"""
def __load_csv_from_file(file_path, cast, delimiter, encoding):
"""Load a CSV file and convert it into a generator of namedtuples"""
# skipcq: PTC-W6004
with open(file_path, mode='r', newline='', encoding=encoding) as csvfile:
csvreader = reader(csvfile, delimiter=delimiter)
yield from __process_csv(csvfile, cast, delimiter)


def __load_csv_from_string(csv_string, cast, delimiter):
"""Load a CSV from string and convert it into a generator of namedtuples"""
with StringIO(csv_string) as csvfile:
yield from __process_csv(csvfile, cast, delimiter)


# Create a namedtuple type, casting the header values to int or float if possible
header = __get_csv_header(csvreader)
def __process_csv(csvfile, cast, delimiter):
"""Process CSV data and yield namedtuples"""
csvreader = reader(csvfile, delimiter=delimiter)

Row = namedtuple('Row', list(header))
# Create a namedtuple type, casting the header values to int or float if possible
header = __get_csv_header(csvreader)
if not header:
return

mapper = LoaderUtils.try_cast if cast else lambda x: x
Row = namedtuple('Row', list(header))
mapper = LoaderUtils.try_cast if cast else lambda x: x

# Process the data, casting values to int or float if possible
data = [Row(*[mapper(value) for value in row]) for row in csvreader]
return data
# Yield the data row by row, casting values to int or float if possible
for row in csvreader:
yield Row(*[mapper(value) for value in row])


def __get_csv_header(csvreader):
Expand Down
65 changes: 43 additions & 22 deletions pystreamapi/loaders/__json/__json_loader.py
Original file line number Diff line number Diff line change
@@ -1,40 +1,61 @@
import json as jsonlib
from collections import namedtuple
from typing import Any, Iterator

from pystreamapi.loaders.__lazy_file_iterable import LazyFileIterable
from pystreamapi.loaders.__loader_utils import LoaderUtils


def json(src: str, read_from_src=False) -> LazyFileIterable:
def json(src: str, read_from_src=False) -> Iterator[Any]:
"""
Loads JSON data from either a path or a string and converts it into a list of namedtuples.
Lazily loads JSON data from either a path or a string and yields namedtuples.

Returns:
list: A list of namedtuples, where each namedtuple represents an object in the JSON.
:param src: Either the path to a JSON file or a JSON string.
:param read_from_src: If True, src is treated as a JSON string. If False, src is treated as
a path to a JSON file.
Args:
src (str): Either the path to a JSON file or a JSON string.
read_from_src (bool): If True, src is treated as a JSON string.
If False, src is treated as a path to a JSON file.

Yields:
namedtuple: Each object in the JSON as a namedtuple.
"""
if read_from_src:
return LazyFileIterable(lambda: __load_json_string(src))
return __lazy_load_json_string(src)
path = LoaderUtils.validate_path(src)
return LazyFileIterable(lambda: __load_json_file(path))
return __lazy_load_json_file(path)


def __lazy_load_json_file(file_path: str) -> Iterator[Any]:
"""Lazily read and parse a JSON file, yielding namedtuples."""

def generator():
"""Generate namedtuples from the JSON file contents."""
# skipcq: PTC-W6004
with open(file_path, mode='r', encoding='utf-8') as jsonfile:
src = jsonfile.read()
if not src.strip():
return
result = jsonlib.loads(src, object_hook=__dict_to_namedtuple)
if isinstance(result, list):
yield from result
else:
yield result

return generator()


def __load_json_file(file_path):
"""Load a JSON file and convert it into a list of namedtuples"""
# skipcq: PTC-W6004
with open(file_path, mode='r', encoding='utf-8') as jsonfile:
src = jsonfile.read()
if src == '':
return []
data = jsonlib.loads(src, object_hook=__dict_to_namedtuple)
return data
def __lazy_load_json_string(json_string: str) -> Iterator[Any]:
"""Lazily parse a JSON string, yielding namedtuples."""

def generator():
"""Internal generator that yields namedtuples by parsing the JSON string on demand."""
if not json_string.strip():
return
result = jsonlib.loads(json_string, object_hook=__dict_to_namedtuple)
if isinstance(result, list):
yield from result
else:
yield result

def __load_json_string(json_string):
"""Load JSON data from a string and convert it into a list of namedtuples"""
return jsonlib.loads(json_string, object_hook=__dict_to_namedtuple)
return generator()


def __dict_to_namedtuple(d, name='Item'):
Expand Down
95 changes: 47 additions & 48 deletions pystreamapi/loaders/__xml/__xml_loader.py
Original file line number Diff line number Diff line change
@@ -1,34 +1,24 @@
from typing import Iterator, Any

try:
from defusedxml import ElementTree
except ImportError as exc:
raise ImportError(
"Please install the xml_loader extra dependency to use the xml loader."
) from exc
from collections import namedtuple
from pystreamapi.loaders.__lazy_file_iterable import LazyFileIterable
from pystreamapi.loaders.__loader_utils import LoaderUtils


class __XmlLoaderUtil:
"""Utility class for the XML loader."""

def __init__(self):
self.cast_types = True
self.retrieve_children = True


config = __XmlLoaderUtil()


def xml(src: str, read_from_src=False, retrieve_children=True, cast_types=True,
encoding="utf-8") -> LazyFileIterable:
encoding="utf-8") -> Iterator[Any]:
"""
Loads XML data from either a path or a string and converts it into a list of namedtuples.
Warning: This method isn't safe against malicious XML trees. Parse only safe XML from sources
you trust.

Returns:
LazyFileIterable: A list of namedtuples, where each namedtuple represents an XML element.
An iterator with namedtuples, where each namedtuple represents an XML element.
:param retrieve_children: If true, the children of the root element are used as stream
elements.
:param encoding: The encoding of the XML file.
Expand All @@ -37,65 +27,76 @@ def xml(src: str, read_from_src=False, retrieve_children=True, cast_types=True,
a path to an XML file.
:param cast_types: Set as False to disable casting of values to int, bool or float.
"""
config.cast_types = cast_types
config.retrieve_children = retrieve_children
if read_from_src:
return LazyFileIterable(lambda: __load_xml_string(src))
return _lazy_parse_xml_string(src, retrieve_children, cast_types)

path = LoaderUtils.validate_path(src)
return LazyFileIterable(lambda: __load_xml_file(path, encoding))
return _lazy_parse_xml_file(path, encoding, retrieve_children, cast_types)


def _lazy_parse_xml_file(file_path: str, encoding: str,
retrieve_children: bool, cast_types: bool) -> Iterator[Any]:
"""Lazily parse an XML file by reading its content and yielding parsed namedtuples."""
def generator():
"""Generator that reads the XML file and yields parsed namedtuples lazily."""
# skipcq: PTC-W6004
with open(file_path, mode='r', encoding=encoding) as xmlfile:
xml_string = xmlfile.read()
yield from _parse_xml_string_lazy(xml_string, retrieve_children, cast_types)

return generator()

def __load_xml_file(file_path, encoding):
"""Load an XML file and convert it into a list of namedtuples."""
# skipcq: PTC-W6004
with open(file_path, mode='r', encoding=encoding) as xmlfile:
src = xmlfile.read()
if src:
return __parse_xml_string(src)
return []

def _lazy_parse_xml_string(xml_string: str, retrieve_children: bool,
cast_types: bool) -> Iterator[Any]:
"""Lazily parse an XML string by yielding parsed namedtuples for each element."""
def generator():
"""Generator that yields parsed namedtuples from the XML string lazily."""
yield from _parse_xml_string_lazy(xml_string, retrieve_children, cast_types)

def __load_xml_string(xml_string):
"""Load XML data from a string and convert it into a list of namedtuples."""
return __parse_xml_string(xml_string)
return generator()


def __parse_xml_string(xml_string):
"""Parse XML string and convert it into a list of namedtuples."""
def _parse_xml_string_lazy(xml_string: str, retrieve_children: bool,
cast_types: bool) -> Iterator[Any]:
"""Parse an XML string into namedtuples, optionally yielding child elements lazily."""
root = ElementTree.fromstring(xml_string)
parsed_xml = __parse_xml(root)
return __flatten(parsed_xml) if config.retrieve_children else [parsed_xml]
parsed = __parse_xml(root, cast_types)
if retrieve_children:
yield from __flatten(parsed)
else:
yield parsed


def __parse_xml(element):
def __parse_xml(element, cast_types: bool):
"""Parse XML element and convert it into a namedtuple."""
if len(element) == 0:
return __parse_empty_element(element)
return __parse_empty_element(element, cast_types)
if len(element) == 1:
return __parse_single_element(element)
return __parse_multiple_elements(element)
return __parse_single_element(element, cast_types)
return __parse_multiple_elements(element, cast_types)


def __parse_empty_element(element):
def __parse_empty_element(element, cast_types: bool):
"""Parse XML element without children and convert it into a namedtuple."""
return LoaderUtils.try_cast(element.text) if config.cast_types else element.text
return LoaderUtils.try_cast(element.text) if cast_types else element.text


def __parse_single_element(element):
def __parse_single_element(element, cast_types: bool):
"""Parse XML element with a single child and convert it into a namedtuple."""
sub_element = element[0]
sub_item = __parse_xml(sub_element)
sub_item = __parse_xml(sub_element, cast_types)
Item = namedtuple(element.tag, [sub_element.tag])
return Item(sub_item)


def __parse_multiple_elements(element):
def __parse_multiple_elements(element, cast_types: bool):
"""Parse XML element with multiple children and convert it into a namedtuple."""
tag_dict = {}
for e in element:
if e.tag not in tag_dict:
tag_dict[e.tag] = []
tag_dict[e.tag].append(__parse_xml(e))
tag_dict[e.tag].append(__parse_xml(e, cast_types))
filtered_dict = __filter_single_items(tag_dict)
Item = namedtuple(element.tag, filtered_dict.keys())
return Item(*filtered_dict.values())
Expand All @@ -107,11 +108,9 @@ def __filter_single_items(tag_dict):


def __flatten(data):
"""Flatten a list of lists."""
res = []
"""Yield flattened elements from a possibly nested structure."""
for item in data:
if isinstance(item, list):
res.extend(item)
yield from item
else:
res.append(item)
return res
yield item
Loading
Loading