Skip to content

Commit 435352c

Browse files
committed
Add mandatory mode parameter support to evaluation function and tests
Refactors the evaluation function to require a `mode` parameter (`demo` or `io_test`) for defining behavior. Updates unit tests and Postman collection to include the required `mode` parameter. Adds error handling for missing/unknown mode values with appropriate feedback.
1 parent 9cb2d66 commit 435352c

3 files changed

Lines changed: 116 additions & 58 deletions

File tree

evaluation_function/evaluation.py

Lines changed: 26 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -76,31 +76,28 @@ def _upload_plots(images: list[Image.Image]) -> list[str]:
7676
return result
7777

7878

79-
def evaluation_function(response: Any, answer: Any, params: Params) -> Result:
80-
tests = params.get("tests", [])
81-
result = Result()
82-
83-
if not tests:
84-
stdout, stderr, timed_out, images = _run_code(str(response), "")
85-
if timed_out:
86-
result.add_feedback("error", f"Code timed out after {_TIMEOUT}s.")
87-
elif stderr and not stdout:
88-
result.add_feedback("error", _code_block("Error", stderr.strip()))
89-
else:
90-
parts = [_code_block("Output", stdout.rstrip() or "(no output)")]
91-
parts.extend(_upload_plots(images))
92-
result.add_feedback("output", "\n\n".join(parts))
93-
return result
79+
def _evaluate_demo(response: str, result: Result) -> Result:
    """Run the submission once with empty stdin and report what it produced.

    Exactly one feedback entry is added to ``result``:

    * ``"error"`` with a timeout message when the run timed out;
    * ``"error"`` with the stripped stderr when there is stderr and no stdout;
    * ``"output"`` otherwise, holding the stdout (or ``(no output)``) followed
      by whatever ``_upload_plots`` returns for the captured images, joined
      with blank lines.

    ``result.is_correct`` is not assigned here.

    Args:
        response: Source code of the submission.
        result: Result object to add feedback to; the same object is returned.

    Returns:
        The ``result`` that was passed in.
    """
    out, err, hit_timeout, plots = _run_code(response, "")

    if hit_timeout:
        result.add_feedback("error", f"Code timed out after {_TIMEOUT}s.")
        return result

    if err and not out:
        result.add_feedback("error", _code_block("Error", err.strip()))
        return result

    # Stdout wins whenever it is non-empty, even if stderr has content too.
    sections = [
        _code_block("Output", out.rstrip() or "(no output)"),
        *_upload_plots(plots),
    ]
    result.add_feedback("output", "\n\n".join(sections))
    return result
9490

9591

92+
def _evaluate_io(response: str, tests: list, result: Result) -> Result:
9693
passed = 0
9794

9895
for i, test in enumerate(tests, 1):
9996
stdin = test.get("input", "")
10097
expected = test.get("expected_output", "").rstrip()
10198
hidden = test.get("hidden", False)
10299

103-
stdout, stderr, timed_out, images = _run_code(str(response), stdin)
100+
stdout, stderr, timed_out, images = _run_code(response, stdin)
104101
actual = stdout.rstrip()
105102
label = f"Hidden test {i}" if hidden else f"Test {i}"
106103

@@ -143,4 +140,16 @@ def evaluation_function(response: Any, answer: Any, params: Params) -> Result:
143140

144141
result.is_correct = passed == len(tests)
145142
result.add_feedback("summary", f"{passed}/{len(tests)} tests passed.")
146-
return result
143+
return result
144+
145+
146+
def evaluation_function(response: Any, answer: Any, params: Params) -> Result:
    """Dispatch a submission to the evaluator selected by ``params["mode"]``.

    Args:
        response: The submitted code; it is converted with ``str()`` before
            being handed to the evaluator.
        answer: Not used by this function.
        params: Evaluation parameters. ``"mode"`` is required and must be
            ``"demo"`` or ``"io_test"``. In ``"io_test"`` mode ``"tests"``
            must be a non-empty list of test definitions.

    Returns:
        A ``Result``. For a missing/unknown mode, or for ``"io_test"`` with no
        tests, the result carries a single ``"error"`` feedback entry and is
        returned without running the submission.
    """
    result = Result()
    mode = params.get("mode")

    if mode == "demo":
        return _evaluate_demo(str(response), result)

    if mode == "io_test":
        # ``or []`` also covers an explicit ``"tests": None``, which would
        # otherwise fail when the I/O evaluator iterates over it.
        tests = params.get("tests") or []
        if not tests:
            # With nothing to run, the I/O evaluator compares 0 passed against
            # 0 tests and would mark any submission as correct.
            result.add_feedback("error", "No tests provided for mode 'io_test'.")
            return result
        return _evaluate_io(str(response), tests, result)

    result.add_feedback("error", f"Unknown or missing mode: {mode!r}. Expected 'demo' or 'io_test'.")
    return result

evaluation_function/evaluation_test.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212

1313
def _params(*tests):
14-
return {"tests": list(tests)}
14+
return {"mode": "io_test", "tests": list(tests)}
1515

1616

1717
def _test(inp, expected, hidden=False):
@@ -54,11 +54,17 @@ def test_runtime_error(self):
5454
self.assertIn("```", result["feedback"])
5555

5656
def test_no_tests(self):
57-
result = evaluation_function(_SQUARE_CODE, None, {}).to_dict()
57+
result = evaluation_function(_SQUARE_CODE, None, {"mode": "demo"}).to_dict()
5858

5959
self.assertFalse(result["is_correct"])
6060
self.assertIn("```", result["feedback"])
6161

62+
def test_missing_mode(self):
    """Calling without a ``mode`` is not correct and the feedback names it."""
    outcome = evaluation_function(_SQUARE_CODE, None, {}).to_dict()

    self.assertFalse(outcome["is_correct"])
    self.assertIn("mode", outcome["feedback"])
67+
6268

6369
_PLOT_CODE = "import matplotlib.pyplot as plt\nplt.plot([1, 2, 3])\n"
6470
_MULTI_PLOT_CODE = (
@@ -73,7 +79,7 @@ class TestImageGeneration(unittest.TestCase):
7379

7480
@patch("evaluation_function.evaluation.upload_image", return_value=_FAKE_URL)
7581
def test_single_plot_demo_mode(self, mock_upload):
76-
result = evaluation_function(_PLOT_CODE, None, {}).to_dict()
82+
result = evaluation_function(_PLOT_CODE, None, {"mode": "demo"}).to_dict()
7783
mock_upload.assert_called_once()
7884
self.assertIn("![Plot 1]", result["feedback"])
7985

@@ -91,14 +97,14 @@ def test_single_plot_failing_test(self, mock_upload):
9197

9298
@patch("evaluation_function.evaluation.upload_image", return_value=_FAKE_URL)
9399
def test_multiple_plots(self, mock_upload):
94-
result = evaluation_function(_MULTI_PLOT_CODE, None, {}).to_dict()
100+
result = evaluation_function(_MULTI_PLOT_CODE, None, {"mode": "demo"}).to_dict()
95101
self.assertEqual(mock_upload.call_count, 2)
96102
self.assertIn("![Plot 1]", result["feedback"])
97103
self.assertIn("![Plot 2]", result["feedback"])
98104

99105
@patch("evaluation_function.evaluation.upload_image", side_effect=ImageUploadError)
100106
def test_upload_failure_graceful(self, mock_upload):
101-
result = evaluation_function(_PLOT_CODE, None, {}).to_dict()
107+
result = evaluation_function(_PLOT_CODE, None, {"mode": "demo"}).to_dict()
102108
self.assertNotIn("![Plot", result["feedback"])
103109

104110
def test_run_code_captures_images(self):

0 commit comments

Comments
 (0)