Skip to content

Commit 12ad019

Browse files
committed
base logic change
1 parent 064f543 commit 12ad019

2 files changed

Lines changed: 333 additions & 81 deletions

File tree

mssql_python/pybind/ddbc_bindings.cpp

Lines changed: 97 additions & 44 deletions
Original file line number | Diff line number | Diff line change
@@ -2914,6 +2914,10 @@ SQLRETURN SQLGetData_wrap(SqlHandlePtr StatementHandle, SQLUSMALLINT colCount, p
29142914
// Note: wcharEncoding parameter is reserved for future use
29152915
// Currently WCHAR data always uses UTF-16LE for Windows compatibility
29162916
(void)wcharEncoding; // Suppress unused parameter warning
2917+
#if !defined(__APPLE__) && !defined(__linux__)
2918+
// On Windows, VARCHAR is fetched as SQL_C_WCHAR, so charEncoding is unused.
2919+
(void)charEncoding;
2920+
#endif
29172921

29182922
LOG("SQLGetData: Getting data from %d columns for statement_handle=%p", colCount,
29192923
(void*)StatementHandle->get());
@@ -2949,6 +2953,8 @@ SQLRETURN SQLGetData_wrap(SqlHandlePtr StatementHandle, SQLUSMALLINT colCount, p
29492953
case SQL_CHAR:
29502954
case SQL_VARCHAR:
29512955
case SQL_LONGVARCHAR: {
2956+
#if defined(__APPLE__) || defined(__linux__)
2957+
// On Linux/macOS, the ODBC driver returns UTF-8 for SQL_C_CHAR.
29522958
if (columnSize == SQL_NO_TOTAL || columnSize == 0 ||
29532959
columnSize > SQL_MAX_LOB_SIZE) {
29542960
LOG("SQLGetData: Streaming LOB for column %d (SQL_C_CHAR) "
@@ -2957,34 +2963,16 @@ SQLRETURN SQLGetData_wrap(SqlHandlePtr StatementHandle, SQLUSMALLINT colCount, p
29572963
row.append(
29582964
FetchLobColumnData(hStmt, i, SQL_C_CHAR, false, false, charEncoding));
29592965
} else {
2960-
// Allocate columnSize * 4 + 1 on ALL platforms (no #if guard).
2961-
//
2962-
// Why this differs from SQLBindColums / FetchBatchData:
2963-
// Those two functions use #if to apply *4 only on Linux/macOS,
2964-
// because on Windows with a non-UTF-8 collation (e.g. CP1252)
2965-
// each character occupies exactly 1 byte, so *1 suffices and
2966-
// saves memory across the entire batch (fetchSize × numCols
2967-
// buffers).
2968-
//
2969-
// SQLGetData_wrap allocates a single temporary buffer per
2970-
// column per row, so the over-allocation cost is negligible.
2971-
// Using *4 unconditionally here keeps the code simple and
2972-
// correct on every platform—including Windows with a UTF-8
2973-
// collation where multi-byte chars could otherwise cause
2974-
// truncation at the exact column boundary (e.g. CP1252 é in
2975-
// VARCHAR(10)).
2966+
// Allocate columnSize * 4 + 1 to accommodate UTF-8 expansion.
29762967
uint64_t fetchBufferSize = columnSize * 4 + 1 /* null-termination */;
29772968
std::vector<SQLCHAR> dataBuffer(fetchBufferSize);
29782969
SQLLEN dataLen;
29792970
ret = SQLGetData_ptr(hStmt, i, SQL_C_CHAR, dataBuffer.data(), dataBuffer.size(),
29802971
&dataLen);
29812972
if (SQL_SUCCEEDED(ret)) {
2982-
// columnSize is in chars, dataLen is in bytes
29832973
if (dataLen > 0) {
29842974
uint64_t numCharsInData = dataLen / sizeof(SQLCHAR);
29852975
if (numCharsInData < dataBuffer.size()) {
2986-
// SQLGetData will null-terminate the data
2987-
// Use Python's codec system to decode bytes.
29882976
const std::string decodeEncoding =
29892977
GetEffectiveCharDecoding(charEncoding);
29902978
py::bytes raw_bytes(reinterpret_cast<char*>(dataBuffer.data()),
@@ -3001,11 +2989,9 @@ SQLRETURN SQLGetData_wrap(SqlHandlePtr StatementHandle, SQLUSMALLINT colCount, p
30012989
LOG_ERROR(
30022990
"SQLGetData: Failed to decode CHAR column %d with '%s': %s",
30032991
i, decodeEncoding.c_str(), e.what());
3004-
// Return raw bytes as fallback
30052992
row.append(raw_bytes);
30062993
}
30072994
} else {
3008-
// Buffer too small, fallback to streaming
30092995
LOG("SQLGetData: CHAR column %d data truncated "
30102996
"(buffer_size=%zu), using streaming LOB",
30112997
i, dataBuffer.size());
@@ -3037,6 +3023,66 @@ SQLRETURN SQLGetData_wrap(SqlHandlePtr StatementHandle, SQLUSMALLINT colCount, p
30373023
row.append(py::none());
30383024
}
30393025
}
3026+
#else
3027+
// On Windows, request SQL_C_WCHAR so the ODBC driver converts
3028+
// from the server's native encoding (e.g. CP1252) to UTF-16.
3029+
// This avoids the need to guess the server's code page and
3030+
// eliminates the bytes-vs-str inconsistency.
3031+
if (columnSize == SQL_NO_TOTAL || columnSize == 0 ||
3032+
columnSize > SQL_MAX_LOB_SIZE) {
3033+
LOG("SQLGetData: Streaming LOB for column %d (VARCHAR as SQL_C_WCHAR) "
3034+
"- columnSize=%lu",
3035+
i, (unsigned long)columnSize);
3036+
row.append(FetchLobColumnData(hStmt, i, SQL_C_WCHAR, true, false, "utf-16le"));
3037+
} else {
3038+
uint64_t fetchBufferSize =
3039+
(columnSize + 1) * sizeof(SQLWCHAR); // +1 for null terminator
3040+
std::vector<SQLWCHAR> dataBuffer(columnSize + 1);
3041+
SQLLEN dataLen;
3042+
ret = SQLGetData_ptr(hStmt, i, SQL_C_WCHAR, dataBuffer.data(), fetchBufferSize,
3043+
&dataLen);
3044+
if (SQL_SUCCEEDED(ret)) {
3045+
if (dataLen > 0) {
3046+
uint64_t numCharsInData = dataLen / sizeof(SQLWCHAR);
3047+
if (numCharsInData < dataBuffer.size()) {
3048+
std::wstring wstr(reinterpret_cast<wchar_t*>(dataBuffer.data()));
3049+
row.append(py::cast(wstr));
3050+
LOG("SQLGetData: VARCHAR column %d decoded via SQL_C_WCHAR, "
3051+
"length=%lu",
3052+
i, (unsigned long)numCharsInData);
3053+
} else {
3054+
LOG("SQLGetData: VARCHAR column %d data truncated "
3055+
"(as WCHAR), using streaming LOB",
3056+
i);
3057+
row.append(FetchLobColumnData(hStmt, i, SQL_C_WCHAR, true, false,
3058+
"utf-16le"));
3059+
}
3060+
} else if (dataLen == SQL_NULL_DATA) {
3061+
LOG("SQLGetData: Column %d is NULL (VARCHAR via WCHAR)", i);
3062+
row.append(py::none());
3063+
} else if (dataLen == 0) {
3064+
row.append(py::str(""));
3065+
} else if (dataLen == SQL_NO_TOTAL) {
3066+
LOG("SQLGetData: Cannot determine data length "
3067+
"(SQL_NO_TOTAL) for column %d (VARCHAR via WCHAR), "
3068+
"returning NULL",
3069+
i);
3070+
row.append(py::none());
3071+
} else if (dataLen < 0) {
3072+
LOG("SQLGetData: Unexpected negative data length "
3073+
"for column %d (VARCHAR via WCHAR) - dataLen=%ld",
3074+
i, (long)dataLen);
3075+
ThrowStdException("SQLGetData returned an unexpected negative "
3076+
"data length");
3077+
}
3078+
} else {
3079+
LOG("SQLGetData: Error retrieving data for column %d "
3080+
"(VARCHAR via WCHAR) - SQLRETURN=%d, returning NULL",
3081+
i, ret);
3082+
row.append(py::none());
3083+
}
3084+
}
3085+
#endif
30403086
break;
30413087
}
30423088
case SQL_SS_XML: {
@@ -3487,29 +3533,26 @@ SQLRETURN SQLBindColums(SQLHSTMT hStmt, ColumnBuffers& buffers, py::list& column
34873533
// TODO: handle variable length data correctly. This logic won't
34883534
// suffice
34893535
HandleZeroColumnSizeAtFetch(columnSize);
3490-
// Use columnSize * 4 + 1 on Linux/macOS to accommodate UTF-8
3491-
// expansion. The ODBC driver returns UTF-8 for SQL_C_CHAR where
3492-
// each character can be up to 4 bytes.
34933536
#if defined(__APPLE__) || defined(__linux__)
3537+
// On Linux/macOS, the ODBC driver returns UTF-8 for SQL_C_CHAR
3538+
// where each character can be up to 4 bytes.
34943539
uint64_t fetchBufferSize = columnSize * 4 + 1 /*null-terminator*/;
3495-
#else
3496-
uint64_t fetchBufferSize = columnSize + 1 /*null-terminator*/;
3497-
#endif
3498-
// TODO: For LONGVARCHAR/BINARY types, columnSize is returned as
3499-
// 2GB-1 by SQLDescribeCol. So fetchBufferSize = 2GB.
3500-
// fetchSize=1 if columnSize>1GB. So we'll allocate a vector of
3501-
// size 2GB. If a query fetches multiple (say N) LONG...
3502-
// columns, we will have allocated multiple (N) 2GB sized
3503-
// vectors. This will make driver very slow. And if the N is
3504-
// high enough, we could hit the OS limit for heap memory that
3505-
// we can allocate, & hence get a std::bad_alloc. The process
3506-
// could also be killed by OS for consuming too much memory.
3507-
// Hence this will be revisited in beta to not allocate 2GB+
3508-
// memory, & use streaming instead
35093540
buffers.charBuffers[col - 1].resize(fetchSize * fetchBufferSize);
35103541
ret = SQLBindCol_ptr(hStmt, col, SQL_C_CHAR, buffers.charBuffers[col - 1].data(),
35113542
fetchBufferSize * sizeof(SQLCHAR),
35123543
buffers.indicators[col - 1].data());
3544+
#else
3545+
// On Windows, the ODBC driver returns bytes in the server's
3546+
// native encoding (e.g., CP1252). Rather than guessing the
3547+
// code page, we request SQL_C_WCHAR so the driver performs
3548+
// the conversion to UTF-16 — exactly matching how NVARCHAR
3549+
// columns are already handled.
3550+
uint64_t fetchBufferSize = columnSize + 1 /*null-terminator*/;
3551+
buffers.wcharBuffers[col - 1].resize(fetchSize * fetchBufferSize);
3552+
ret = SQLBindCol_ptr(hStmt, col, SQL_C_WCHAR, buffers.wcharBuffers[col - 1].data(),
3553+
fetchBufferSize * sizeof(SQLWCHAR),
3554+
buffers.indicators[col - 1].data());
3555+
#endif
35133556
break;
35143557
}
35153558
case SQL_WCHAR:
@@ -3675,9 +3718,9 @@ SQLRETURN FetchBatchData(SQLHSTMT hStmt, ColumnBuffers& buffers, py::list& colum
36753718
HandleZeroColumnSizeAtFetch(columnInfos[col].processedColumnSize);
36763719
// On Linux/macOS, the ODBC driver returns UTF-8 for SQL_C_CHAR where
36773720
// each character can be up to 4 bytes. Must match SQLBindColums buffer.
3678-
#if defined(__APPLE__) || defined(__linux__)
36793721
SQLSMALLINT dt = columnInfos[col].dataType;
36803722
bool isCharType = (dt == SQL_CHAR || dt == SQL_VARCHAR || dt == SQL_LONGVARCHAR);
3723+
#if defined(__APPLE__) || defined(__linux__)
36813724
if (isCharType) {
36823725
columnInfos[col].fetchBufferSize = columnInfos[col].processedColumnSize * 4 +
36833726
1; // *4 for UTF-8, +1 for null terminator
@@ -3686,6 +3729,10 @@ SQLRETURN FetchBatchData(SQLHSTMT hStmt, ColumnBuffers& buffers, py::list& colum
36863729
columnInfos[col].processedColumnSize + 1; // +1 for null terminator
36873730
}
36883731
#else
3732+
// On Windows, VARCHAR columns are fetched as SQL_C_WCHAR (see
3733+
// SQLBindColums). The fetchBufferSize is in SQLWCHAR elements,
3734+
// matching the wcharBuffers layout.
3735+
(void)isCharType; // same formula for all types on Windows
36893736
columnInfos[col].fetchBufferSize =
36903737
columnInfos[col].processedColumnSize + 1; // +1 for null terminator
36913738
#endif
@@ -3740,7 +3787,14 @@ SQLRETURN FetchBatchData(SQLHSTMT hStmt, ColumnBuffers& buffers, py::list& colum
37403787
case SQL_CHAR:
37413788
case SQL_VARCHAR:
37423789
case SQL_LONGVARCHAR:
3790+
#if defined(__APPLE__) || defined(__linux__)
37433791
columnProcessors[col] = ColumnProcessors::ProcessChar;
3792+
#else
3793+
// On Windows, VARCHAR columns are fetched as SQL_C_WCHAR
3794+
// (the driver converts from the server's native encoding to
3795+
// UTF-16), so we reuse the NVARCHAR processor.
3796+
columnProcessors[col] = ColumnProcessors::ProcessWChar;
3797+
#endif
37443798
break;
37453799
case SQL_WCHAR:
37463800
case SQL_WVARCHAR:
@@ -4048,7 +4102,8 @@ size_t calculateRowSize(py::list& columnNames, SQLUSMALLINT numCols) {
40484102
break;
40494103
case SQL_SS_UDT:
40504104
rowSize += (static_cast<SQLLEN>(columnSize) == SQL_NO_TOTAL || columnSize == 0)
4051-
? SQL_MAX_LOB_SIZE : columnSize;
4105+
? SQL_MAX_LOB_SIZE
4106+
: columnSize;
40524107
break;
40534108
case SQL_BINARY:
40544109
case SQL_VARBINARY:
@@ -4112,8 +4167,7 @@ SQLRETURN FetchMany_wrap(SqlHandlePtr StatementHandle, py::list& rows, int fetch
41124167

41134168
if ((dataType == SQL_WVARCHAR || dataType == SQL_WLONGVARCHAR || dataType == SQL_VARCHAR ||
41144169
dataType == SQL_LONGVARCHAR || dataType == SQL_VARBINARY ||
4115-
dataType == SQL_LONGVARBINARY || dataType == SQL_SS_XML ||
4116-
dataType == SQL_SS_UDT) &&
4170+
dataType == SQL_LONGVARBINARY || dataType == SQL_SS_XML || dataType == SQL_SS_UDT) &&
41174171
(columnSize == 0 || columnSize == SQL_NO_TOTAL || columnSize > SQL_MAX_LOB_SIZE)) {
41184172
lobColumns.push_back(i + 1); // 1-based
41194173
}
@@ -4252,8 +4306,7 @@ SQLRETURN FetchAll_wrap(SqlHandlePtr StatementHandle, py::list& rows,
42524306

42534307
if ((dataType == SQL_WVARCHAR || dataType == SQL_WLONGVARCHAR || dataType == SQL_VARCHAR ||
42544308
dataType == SQL_LONGVARCHAR || dataType == SQL_VARBINARY ||
4255-
dataType == SQL_LONGVARBINARY || dataType == SQL_SS_XML ||
4256-
dataType == SQL_SS_UDT) &&
4309+
dataType == SQL_LONGVARBINARY || dataType == SQL_SS_XML || dataType == SQL_SS_UDT) &&
42574310
(columnSize == 0 || columnSize == SQL_NO_TOTAL || columnSize > SQL_MAX_LOB_SIZE)) {
42584311
lobColumns.push_back(i + 1); // 1-based
42594312
}

0 commit comments

Comments
 (0)