diff --git a/mssql_python/pybind/CMakeLists.txt b/mssql_python/pybind/CMakeLists.txt index 45893318..bcc64762 100644 --- a/mssql_python/pybind/CMakeLists.txt +++ b/mssql_python/pybind/CMakeLists.txt @@ -215,6 +215,22 @@ endif() message(STATUS "Final Python library directory: ${PYTHON_LIB_DIR}") +find_package(simdutf CONFIG QUIET) + +if(NOT simdutf_FOUND) + include(FetchContent) + message(STATUS "simdutf not found via find_package; downloading v8.2.0 source archive with FetchContent") + FetchContent_Declare( + simdutf + URL https://github.com/simdutf/simdutf/archive/refs/tags/v8.2.0.tar.gz + DOWNLOAD_EXTRACT_TIMESTAMP FALSE + ) + set(SIMDUTF_TESTS OFF CACHE BOOL "Disable simdutf tests" FORCE) + set(SIMDUTF_TOOLS OFF CACHE BOOL "Disable simdutf tools" FORCE) + set(SIMDUTF_BENCHMARKS OFF CACHE BOOL "Disable simdutf benchmarks" FORCE) + FetchContent_MakeAvailable(simdutf) +endif() + set(DDBC_SOURCE "ddbc_bindings.cpp") message(STATUS "Using standard source file: ${DDBC_SOURCE}") # Include connection module and logger bridge @@ -293,6 +309,8 @@ else() endif() endif() +target_link_libraries(ddbc_bindings PRIVATE simdutf::simdutf) + # Compiler definitions target_compile_definitions(ddbc_bindings PRIVATE HAVE_SNPRINTF diff --git a/mssql_python/pybind/connection/connection.cpp b/mssql_python/pybind/connection/connection.cpp index aaf172b9..9bd8de9d 100644 --- a/mssql_python/pybind/connection/connection.cpp +++ b/mssql_python/pybind/connection/connection.cpp @@ -45,7 +45,7 @@ static SqlHandlePtr getEnvHandle() { // This class wraps low-level ODBC operations like connect/disconnect, // transaction control, and autocommit configuration. //------------------------------------------------------------------------------------------------- -Connection::Connection(const std::wstring& conn_str, bool use_pool) +Connection::Connection(const std::u16string& conn_str, bool use_pool) : _connStr(conn_str), _autocommit(false), _fromPool(use_pool) { allocateDbcHandle(); } @@ -74,17 +74,7 @@ void Connection::connect(const py::dict& attrs_before) { setAutocommit(_autocommit); } } - SQLWCHAR* connStrPtr; -#if defined(__APPLE__) || defined(__linux__) // macOS/Linux handling - LOG("Creating connection string buffer for macOS/Linux"); - std::vector connStrBuffer = WStringToSQLWCHAR(_connStr); - // Ensure the buffer is null-terminated - LOG("Connection string buffer size=%zu", connStrBuffer.size()); - connStrPtr = connStrBuffer.data(); - LOG("Connection string buffer created"); -#else - connStrPtr = const_cast(_connStr.c_str()); -#endif + SQLWCHAR* connStrPtr = const_cast(reinterpretU16stringAsSqlWChar(_connStr)); SQLRETURN ret; { // Release the GIL during the blocking ODBC connect call. @@ -180,8 +170,7 @@ void Connection::disconnect() { void Connection::checkError(SQLRETURN ret) const { if (!SQL_SUCCEEDED(ret)) { ErrorInfo err = SQLCheckError_Wrap(SQL_HANDLE_DBC, _dbcHandle, ret); - std::string errorMsg = WideToUTF8(err.ddbcErrorMsg); - ThrowStdException(errorMsg); + ThrowStdException(err.ddbcErrorMsg); } } @@ -298,39 +287,13 @@ SQLRETURN Connection::setAttribute(SQLINTEGER attribute, py::object value) { return ret; } else if (py::isinstance(value)) { try { - std::string utf8_str = value.cast(); - - // Convert to wide string - std::wstring wstr = Utf8ToWString(utf8_str); - if (wstr.empty() && !utf8_str.empty()) { - LOG("Failed to convert string value to wide string for " - "attribute=%d", - attribute); - return SQL_ERROR; - } - this->wstrStringBuffer.clear(); - this->wstrStringBuffer = std::move(wstr); + this->wstrStringBuffer = value.cast(); SQLPOINTER ptr; SQLINTEGER length; - -#if defined(__APPLE__) || defined(__linux__) - // For macOS/Linux, convert wstring to SQLWCHAR buffer - std::vector sqlwcharBuffer = WStringToSQLWCHAR(this->wstrStringBuffer); - if (sqlwcharBuffer.empty() && !this->wstrStringBuffer.empty()) { - LOG("Failed to convert wide string to SQLWCHAR buffer for " - "attribute=%d", - attribute); - return SQL_ERROR; - } - - ptr = sqlwcharBuffer.data(); - length = static_cast(sqlwcharBuffer.size() * sizeof(SQLWCHAR)); -#else - // On Windows, wchar_t and SQLWCHAR are the same size - ptr = const_cast(this->wstrStringBuffer.c_str()); + + ptr = const_cast(reinterpretU16stringAsSqlWChar(this->wstrStringBuffer)); length = static_cast(this->wstrStringBuffer.length() * sizeof(SQLWCHAR)); -#endif SQLRETURN ret = SQLSetConnectAttr_ptr(_dbcHandle->get(), attribute, ptr, length); if (!SQL_SUCCEEDED(ret)) { @@ -432,10 +395,9 @@ std::chrono::steady_clock::time_point Connection::lastUsed() const { return _lastUsed; } -ConnectionHandle::ConnectionHandle(const std::string& connStr, bool usePool, +ConnectionHandle::ConnectionHandle(const std::u16string& connStr, bool usePool, const py::dict& attrsBefore) - : _usePool(usePool) { - _connStr = Utf8ToWString(connStr); + : _usePool(usePool), _connStr(connStr) { if (_usePool) { _conn = ConnectionPoolManager::getInstance().acquireConnection(_connStr, attrsBefore); } else { @@ -576,9 +538,7 @@ void ConnectionHandle::setAttr(int attribute, py::object value) { std::string errorMsg = "Failed to set connection attribute " + std::to_string(attribute); if (!errorInfo.ddbcErrorMsg.empty()) { - // Convert wstring to string for concatenation - std::string ddbcErrorStr = WideToUTF8(errorInfo.ddbcErrorMsg); - errorMsg += ": " + ddbcErrorStr; + errorMsg += ": " + errorInfo.ddbcErrorMsg; } LOG("Connection setAttribute failed: %s", errorMsg.c_str()); diff --git a/mssql_python/pybind/connection/connection.h b/mssql_python/pybind/connection/connection.h index 6c6f1e63..76e47323 100644 --- a/mssql_python/pybind/connection/connection.h +++ b/mssql_python/pybind/connection/connection.h @@ -21,7 +21,7 @@ class Connection { public: - Connection(const std::wstring& connStr, bool fromPool); + Connection(const std::u16string& connStr, bool fromPool); ~Connection(); @@ -63,12 +63,12 @@ class Connection { void checkError(SQLRETURN ret) const; void applyAttrsBefore(const py::dict& attrs_before); - std::wstring _connStr; + std::u16string _connStr; bool _fromPool = false; bool _autocommit = true; SqlHandlePtr _dbcHandle; std::chrono::steady_clock::time_point _lastUsed; - std::wstring wstrStringBuffer; // wstr buffer for string attribute setting + std::u16string wstrStringBuffer; // UTF-16 buffer for wide ODBC attributes std::string strBytesBuffer; // string buffer for byte attributes setting // Track child statement handles to mark them as implicitly freed when connection closes @@ -90,7 +90,7 @@ class Connection { class ConnectionHandle { public: - ConnectionHandle(const std::string& connStr, bool usePool, + ConnectionHandle(const std::u16string& connStr, bool usePool, const py::dict& attrsBefore = py::dict()); ~ConnectionHandle(); @@ -108,5 +108,5 @@ class ConnectionHandle { private: std::shared_ptr _conn; bool _usePool; - std::wstring _connStr; + std::u16string _connStr; }; diff --git a/mssql_python/pybind/connection/connection_pool.cpp b/mssql_python/pybind/connection/connection_pool.cpp index 7c6b7f70..c95dc8f9 100644 --- a/mssql_python/pybind/connection/connection_pool.cpp +++ b/mssql_python/pybind/connection/connection_pool.cpp @@ -12,7 +12,7 @@ ConnectionPool::ConnectionPool(size_t max_size, int idle_timeout_secs) : _max_size(max_size), _idle_timeout_secs(idle_timeout_secs), _current_size(0) {} -std::shared_ptr ConnectionPool::acquire(const std::wstring& connStr, +std::shared_ptr ConnectionPool::acquire(const std::u16string& connStr, const py::dict& attrs_before) { std::vector> to_disconnect; std::shared_ptr valid_conn = nullptr; @@ -145,7 +145,7 @@ ConnectionPoolManager& ConnectionPoolManager::getInstance() { return manager; } -std::shared_ptr ConnectionPoolManager::acquireConnection(const std::wstring& connStr, +std::shared_ptr ConnectionPoolManager::acquireConnection(const std::u16string& connStr, const py::dict& attrs_before) { std::shared_ptr pool; { @@ -163,7 +163,7 @@ std::shared_ptr ConnectionPoolManager::acquireConnection(const std:: return pool->acquire(connStr, attrs_before); } -void ConnectionPoolManager::returnConnection(const std::wstring& conn_str, +void ConnectionPoolManager::returnConnection(const std::u16string& conn_str, const std::shared_ptr conn) { std::shared_ptr pool; { diff --git a/mssql_python/pybind/connection/connection_pool.h b/mssql_python/pybind/connection/connection_pool.h index 7a8a98c5..f4d9e0f4 100644 --- a/mssql_python/pybind/connection/connection_pool.h +++ b/mssql_python/pybind/connection/connection_pool.h @@ -20,7 +20,7 @@ class ConnectionPool { ConnectionPool(size_t max_size, int idle_timeout_secs); // Acquires a connection from the pool or creates a new one if under limit - std::shared_ptr acquire(const std::wstring& connStr, + std::shared_ptr acquire(const std::u16string& connStr, const py::dict& attrs_before = py::dict()); // Returns a connection to the pool for reuse @@ -46,11 +46,11 @@ class ConnectionPoolManager { void configure(int max_size, int idle_timeout); // Gets a connection from the appropriate pool (creates one if none exists) - std::shared_ptr acquireConnection(const std::wstring& conn_str, + std::shared_ptr acquireConnection(const std::u16string& conn_str, const py::dict& attrs_before = py::dict()); // Returns a connection to its original pool - void returnConnection(const std::wstring& conn_str, std::shared_ptr conn); + void returnConnection(const std::u16string& conn_str, std::shared_ptr conn); // Closes all pools and their connections void closePools(); @@ -60,7 +60,7 @@ class ConnectionPoolManager { ~ConnectionPoolManager() = default; // Map from connection string to connection pool - std::unordered_map> _pools; + std::unordered_map> _pools; // Protects access to the _pools map std::mutex _manager_mutex; diff --git a/mssql_python/pybind/ddbc_bindings.cpp b/mssql_python/pybind/ddbc_bindings.cpp index c7537cbe..5a95f18b 100644 --- a/mssql_python/pybind/ddbc_bindings.cpp +++ b/mssql_python/pybind/ddbc_bindings.cpp @@ -572,14 +572,11 @@ SQLRETURN BindParameters(SQLHANDLE hStmt, const py::list& params, bufferLength = 0; } else { // Normal small-string case - std::wstring* strParam = - AllocateParamBuffer(paramBuffers, param.cast()); + std::u16string* sqlwcharBuffer = AllocateParamBuffer( + paramBuffers, param.cast()); LOG("BindParameters: param[%d] SQL_C_WCHAR - String " "length=%zu characters, buffer=%zu bytes", - paramIndex, strParam->size(), strParam->size() * sizeof(SQLWCHAR)); - std::vector* sqlwcharBuffer = - AllocateParamBuffer>(paramBuffers, - WStringToSQLWCHAR(*strParam)); + paramIndex, sqlwcharBuffer->size(), sqlwcharBuffer->size() * sizeof(SQLWCHAR)); dataPtr = sqlwcharBuffer->data(); bufferLength = sqlwcharBuffer->size() * sizeof(SQLWCHAR); strLenOrIndPtr = AllocateParamBuffer(paramBuffers); @@ -1562,7 +1559,7 @@ ErrorInfo SQLCheckError_Wrap(SQLSMALLINT handleType, SqlHandlePtr handle, SQLRET ErrorInfo errorInfo; if (retcode == SQL_INVALID_HANDLE) { LOG("SQLCheckError: SQL_INVALID_HANDLE detected - handle is invalid"); - errorInfo.ddbcErrorMsg = std::wstring(L"Invalid handle!"); + errorInfo.ddbcErrorMsg = "Invalid handle!"; return errorInfo; } assert(handle != 0); @@ -1582,16 +1579,11 @@ ErrorInfo SQLCheckError_Wrap(SQLSMALLINT handleType, SqlHandlePtr handle, SQLRET message, SQL_MAX_MESSAGE_LENGTH, &messageLen); if (SQL_SUCCEEDED(diagReturn)) { -#if defined(_WIN32) - // On Windows, SQLWCHAR and wchar_t are compatible - errorInfo.sqlState = std::wstring(sqlState); - errorInfo.ddbcErrorMsg = std::wstring(message); -#else - // On macOS/Linux, need to convert SQLWCHAR (usually unsigned short) - // to wchar_t - errorInfo.sqlState = SQLWCHARToWString(sqlState); - errorInfo.ddbcErrorMsg = SQLWCHARToWString(message, messageLen); -#endif + std::u16string sqlStateUtf16 = dupeSqlWCharAsUtf16Le(sqlState, 5); + std::u16string messageUtf16 = dupeSqlWCharAsUtf16Le(message, static_cast(messageLen)); + + errorInfo.sqlState = utf16LeToUtf8Alloc(std::move(sqlStateUtf16)); + errorInfo.ddbcErrorMsg = utf16LeToUtf8Alloc(std::move(messageUtf16)); } } return errorInfo; @@ -1625,44 +1617,24 @@ py::list SQLGetAllDiagRecords(SqlHandlePtr handle) { if (diagReturn == SQL_NO_DATA || !SQL_SUCCEEDED(diagReturn)) break; -#if defined(_WIN32) - // On Windows, create a formatted UTF-8 string for state+error + std::u16string sqlStateUtf16 = dupeSqlWCharAsUtf16Le(sqlState, 5); + std::u16string messageUtf16 = dupeSqlWCharAsUtf16Le(message, static_cast(messageLen)); - // Convert SQLWCHAR sqlState to UTF-8 - int stateSize = WideCharToMultiByte(CP_UTF8, 0, sqlState, -1, NULL, 0, NULL, NULL); - std::vector stateBuffer(stateSize); - WideCharToMultiByte(CP_UTF8, 0, sqlState, -1, stateBuffer.data(), stateSize, NULL, NULL); - - // Format the state with error code - std::string stateWithError = - "[" + std::string(stateBuffer.data()) + "] (" + std::to_string(nativeError) + ")"; - - // Convert wide string message to UTF-8 - int msgSize = WideCharToMultiByte(CP_UTF8, 0, message, -1, NULL, 0, NULL, NULL); - std::vector msgBuffer(msgSize); - WideCharToMultiByte(CP_UTF8, 0, message, -1, msgBuffer.data(), msgSize, NULL, NULL); - - // Create the tuple with converted strings - records.append(py::make_tuple(py::str(stateWithError), py::str(msgBuffer.data()))); -#else - // On Unix, use the SQLWCHARToWString utility and then convert to UTF-8 - std::string stateStr = WideToUTF8(SQLWCHARToWString(sqlState)); - std::string msgStr = WideToUTF8(SQLWCHARToWString(message, messageLen)); + std::string stateStr = utf16LeToUtf8Alloc(std::move(sqlStateUtf16)); + std::string msgStr = utf16LeToUtf8Alloc(std::move(messageUtf16)); // Format the state string std::string stateWithError = "[" + stateStr + "] (" + std::to_string(nativeError) + ")"; // Create the tuple with converted strings records.append(py::make_tuple(py::str(stateWithError), py::str(msgStr))); -#endif } return records; } // Wrap SQLExecDirect -SQLRETURN SQLExecDirect_wrap(SqlHandlePtr StatementHandle, const std::wstring& Query) { - std::string queryUtf8 = WideToUTF8(Query); +SQLRETURN SQLExecDirect_wrap(SqlHandlePtr StatementHandle, const std::u16string& Query) { LOG("SQLExecDirect: Executing query directly - statement_handle=%p, " "query_length=%zu chars", (void*)StatementHandle->get(), Query.length()); @@ -1679,13 +1651,7 @@ SQLRETURN SQLExecDirect_wrap(SqlHandlePtr StatementHandle, const std::wstring& Q (SQLPOINTER)SQL_CONCUR_READ_ONLY, 0); } - SQLWCHAR* queryPtr; -#if defined(__APPLE__) || defined(__linux__) - std::vector queryBuffer = WStringToSQLWCHAR(Query); - queryPtr = queryBuffer.data(); -#else - queryPtr = const_cast(Query.c_str()); -#endif + SQLWCHAR* queryPtr = const_cast(reinterpretU16stringAsSqlWChar(Query)); SQLRETURN ret = SQLExecDirect_ptr(StatementHandle->get(), queryPtr, SQL_NTS); if (!SQL_SUCCEEDED(ret)) { LOG("SQLExecDirect: Query execution failed - SQLRETURN=%d", ret); @@ -1772,7 +1738,7 @@ SQLRETURN SQLTables_wrap(SqlHandlePtr StatementHandle, const std::wstring& catal // directly. 'usePrepare' parameter can be used to disable the prepare step for // queries that might already be prepared in a previous call. SQLRETURN SQLExecute_wrap(const SqlHandlePtr statementHandle, - const std::wstring& query /* TODO: Use SQLTCHAR? */, + const std::u16string& query, const py::list& params, std::vector& paramInfos, py::list& isStmtPrepared, const bool usePrepare, const py::dict& encodingSettings) { @@ -1804,13 +1770,7 @@ SQLRETURN SQLExecute_wrap(const SqlHandlePtr statementHandle, SQLSetStmtAttr_ptr(hStmt, SQL_ATTR_CONCURRENCY, (SQLPOINTER)SQL_CONCUR_READ_ONLY, 0); } - SQLWCHAR* queryPtr; -#if defined(__APPLE__) || defined(__linux__) - std::vector queryBuffer = WStringToSQLWCHAR(query); - queryPtr = queryBuffer.data(); -#else - queryPtr = const_cast(query.c_str()); -#endif + SQLWCHAR* queryPtr = const_cast(reinterpretU16stringAsSqlWChar(query)); if (params.size() == 0) { // Execute statement directly if the statement is not parametrized. This // is the fastest way to submit a SQL statement for one-time execution @@ -1886,17 +1846,9 @@ SQLRETURN SQLExecute_wrap(const SqlHandlePtr statementHandle, } if (py::isinstance(pyObj)) { if (matchedInfo->paramCType == SQL_C_WCHAR) { - std::wstring wstr = pyObj.cast(); - const SQLWCHAR* dataPtr = nullptr; - size_t totalChars = 0; -#if defined(__APPLE__) || defined(__linux__) - std::vector sqlwStr = WStringToSQLWCHAR(wstr); - totalChars = sqlwStr.size() - 1; - dataPtr = sqlwStr.data(); -#else - dataPtr = wstr.c_str(); - totalChars = wstr.size(); -#endif + std::u16string utf16 = pyObj.cast(); + size_t totalChars = utf16.size(); + const SQLWCHAR* dataPtr = reinterpretU16stringAsSqlWChar(utf16); size_t offset = 0; size_t chunkChars = DAE_CHUNK_SIZE / sizeof(SQLWCHAR); while (offset < totalChars) { @@ -2087,42 +2039,16 @@ SQLRETURN BindParameterArray(SQLHANDLE hStmt, const py::list& columnwise_params, std::memset(wcharArray + i * (info.columnSize + 1), 0, (info.columnSize + 1) * sizeof(SQLWCHAR)); } else { - std::wstring wstr = columnValues[i].cast(); -#if defined(__APPLE__) || defined(__linux__) - // Convert to UTF-16 first, then check the actual - // UTF-16 length - auto utf16Buf = WStringToSQLWCHAR(wstr); - size_t utf16_len = utf16Buf.size() > 0 ? utf16Buf.size() - 1 : 0; - // Check UTF-16 length (excluding null terminator) - // against column size - if (utf16Buf.size() > 0 && utf16_len > info.columnSize) { - std::string offending = WideToUTF8(wstr); - LOG("BindParameterArray: SQL_C_WCHAR string " - "too long - param_index=%d, row=%zu, " - "utf16_length=%zu, max=%zu", - paramIndex, i, utf16_len, info.columnSize); - ThrowStdException("Input string UTF-16 length exceeds " - "allowed column size at parameter index " + - std::to_string(paramIndex) + ". UTF-16 length: " + - std::to_string(utf16_len) + ", Column size: " + - std::to_string(info.columnSize)); - } - // If we reach here, the UTF-16 string fits - copy - // it completely - std::memcpy(wcharArray + i * (info.columnSize + 1), utf16Buf.data(), - utf16Buf.size() * sizeof(SQLWCHAR)); -#else - // On Windows, wchar_t is already UTF-16, so the + std::u16string wstr = columnValues[i].cast(); + // u16string is already UTF-16, so the // original check is sufficient if (wstr.length() > info.columnSize) { - std::string offending = WideToUTF8(wstr); ThrowStdException("Input string exceeds allowed column size " "at parameter index " + std::to_string(paramIndex)); } std::memcpy(wcharArray + i * (info.columnSize + 1), wstr.c_str(), (wstr.length() + 1) * sizeof(SQLWCHAR)); -#endif strLenOrIndArray[i] = SQL_NTS; } } @@ -2672,7 +2598,7 @@ SQLRETURN BindParameterArray(SQLHANDLE hStmt, const py::list& columnwise_params, return SQL_SUCCESS; } -SQLRETURN SQLExecuteMany_wrap(const SqlHandlePtr statementHandle, const std::wstring& query, +SQLRETURN SQLExecuteMany_wrap(const SqlHandlePtr statementHandle, const std::u16string& query, const py::list& columnwise_params, const std::vector& paramInfos, size_t paramSetSize, const py::dict& encodingSettings) { @@ -2680,16 +2606,8 @@ SQLRETURN SQLExecuteMany_wrap(const SqlHandlePtr statementHandle, const std::wst "param_set_size=%zu", columnwise_params.size(), paramSetSize); SQLHANDLE hStmt = statementHandle->get(); - SQLWCHAR* queryPtr; - -#if defined(__APPLE__) || defined(__linux__) - std::vector queryBuffer = WStringToSQLWCHAR(query); - queryPtr = queryBuffer.data(); - LOG("SQLExecuteMany: Query converted to SQLWCHAR - buffer_size=%zu", queryBuffer.size()); -#else - queryPtr = const_cast(query.c_str()); + SQLWCHAR* queryPtr = const_cast(reinterpretU16stringAsSqlWChar(query)); LOG("SQLExecuteMany: Using wide string query directly"); -#endif RETCODE rc = SQLPrepare_ptr(hStmt, queryPtr, SQL_NTS); if (!SQL_SUCCEEDED(rc)) { LOG("SQLExecuteMany: SQLPrepare failed - rc=%d", rc); @@ -2868,11 +2786,7 @@ SQLRETURN SQLDescribeCol_wrap(SqlHandlePtr StatementHandle, py::list& ColumnMeta if (SQL_SUCCEEDED(retcode)) { // Append a named py::dict to ColumnMetadata // TODO: Should we define a struct for this task instead of dict? -#if defined(__APPLE__) || defined(__linux__) - ColumnMetadata.append(py::dict("ColumnName"_a = SQLWCHARToWString(ColumnName, SQL_NTS), -#else - ColumnMetadata.append(py::dict("ColumnName"_a = std::wstring(ColumnName), -#endif + ColumnMetadata.append(py::dict("ColumnName"_a = dupeSqlWCharAsUtf16Le(ColumnName, static_cast(NameLength)), "DataType"_a = DataType, "ColumnSize"_a = ColumnSize, "DecimalDigits"_a = DecimalDigits, "Nullable"_a = Nullable)); @@ -3013,22 +2927,12 @@ py::object FetchLobColumnData(SQLHSTMT hStmt, SQLUSMALLINT colIndex, SQLSMALLINT return py::str(""); } if (isWideChar) { -#if defined(_WIN32) - size_t wcharCount = buffer.size() / sizeof(wchar_t); - std::vector alignedBuf(wcharCount); - std::memcpy(alignedBuf.data(), buffer.data(), buffer.size()); - std::wstring wstr(alignedBuf.data(), wcharCount); - std::string utf8str = WideToUTF8(wstr); - return py::str(utf8str); -#else - // Linux/macOS handling size_t wcharCount = buffer.size() / sizeof(SQLWCHAR); std::vector alignedBuf(wcharCount); std::memcpy(alignedBuf.data(), buffer.data(), buffer.size()); - std::wstring wstr = SQLWCHARToWString(alignedBuf.data(), wcharCount); - std::string utf8str = WideToUTF8(wstr); + std::u16string utf16 = dupeSqlWCharAsUtf16Le(alignedBuf.data(), wcharCount); + std::string utf8str = utf16LeToUtf8Alloc(std::move(utf16)); return py::str(utf8str); -#endif } if (isBinary) { LOG("FetchLobColumnData: Returning binary data - %zu bytes for column " @@ -3306,15 +3210,9 @@ SQLRETURN SQLGetData_wrap(SqlHandlePtr StatementHandle, SQLUSMALLINT colCount, p if (dataLen > 0) { uint64_t numCharsInData = dataLen / sizeof(SQLWCHAR); if (numCharsInData < dataBuffer.size()) { -#if defined(__APPLE__) || defined(__linux__) - std::wstring wstr = - SQLWCHARToWString(dataBuffer.data(), numCharsInData); - std::string utf8str = WideToUTF8(wstr); + std::u16string utf16 = dupeSqlWCharAsUtf16Le(dataBuffer.data(), numCharsInData); + std::string utf8str = utf16LeToUtf8Alloc(std::move(utf16)); row.append(py::str(utf8str)); -#else - std::wstring wstr(reinterpret_cast(dataBuffer.data())); - row.append(py::cast(wstr)); -#endif LOG("SQLGetData: Appended NVARCHAR string " "length=%lu for column %d", (unsigned long)numCharsInData, i); @@ -5098,23 +4996,20 @@ SQLRETURN FetchArrowBatch_wrap( auto wcharSource = &buffers.wcharBuffers[idxCol][idxRowSql * (columnSize + 1)]; auto start = arrowColumnProducer->varVal[idxRowArrow]; auto target_vec = &arrowColumnProducer->varData; -#if defined(_WIN32) - // Convert wide string - int dataLenConverted = WideCharToMultiByte(CP_UTF8, 0, wcharSource, static_cast(dataLenW), NULL, 0, NULL, NULL); - while (target_vec->size() < start + dataLenConverted) { - target_vec->resize(target_vec->size() * 2); - } - WideCharToMultiByte(CP_UTF8, 0, wcharSource, static_cast(dataLenW), reinterpret_cast(&(*target_vec)[start]), dataLenConverted, NULL, NULL); - arrowColumnProducer->varVal[idxRowArrow + 1] = start + dataLenConverted; -#else - // On Unix, use the SQLWCHARToWString utility and then convert to UTF-8 - std::string utf8str = WideToUTF8(SQLWCHARToWString(wcharSource, dataLenW)); - while (target_vec->size() < start + utf8str.size()) { + static_assert(sizeof(SQLWCHAR) == sizeof(char16_t)); + static_assert(alignof(SQLWCHAR) == alignof(char16_t)); + const auto* utf16Source = reinterpret_cast(wcharSource); + size_t maxUtf8Size = dataLenW * 3; + + while (target_vec->size() < start + maxUtf8Size) { target_vec->resize(target_vec->size() * 2); } - std::memcpy(&(*target_vec)[start], utf8str.data(), utf8str.size()); - arrowColumnProducer->varVal[idxRowArrow + 1] = start + utf8str.size(); -#endif + + size_t bytesWritten = simdutf::convert_utf16le_to_utf8_with_replacement( + utf16Source, dataLenW, + reinterpret_cast(target_vec->data() + start)); + + arrowColumnProducer->varVal[idxRowArrow + 1] = start + bytesWritten; break; } case SQL_GUID: { @@ -5743,7 +5638,7 @@ PYBIND11_MODULE(ddbc_bindings, m) { .def("free", &SqlHandle::free, "Free the handle"); py::class_(m, "Connection") - .def(py::init(), py::arg("conn_str"), + .def(py::init(), py::arg("conn_str"), py::arg("use_pool"), py::arg("attrs_before") = py::dict()) .def("close", &ConnectionHandle::close, "Close the connection") .def("commit", &ConnectionHandle::commit, "Commit the current transaction") diff --git a/mssql_python/pybind/ddbc_bindings.h b/mssql_python/pybind/ddbc_bindings.h index 8c6cfed9..b0d00aac 100644 --- a/mssql_python/pybind/ddbc_bindings.h +++ b/mssql_python/pybind/ddbc_bindings.h @@ -14,6 +14,8 @@ #include #include // Add this line for datetime support #include +#include +#include #include #include @@ -42,20 +44,6 @@ inline std::vector WStringToSQLWCHAR(const std::wstring& str) { result.push_back(0); return result; } - -inline std::wstring SQLWCHARToWString(const SQLWCHAR* sqlwStr, size_t length = SQL_NTS) { - if (!sqlwStr) - return std::wstring(); - - if (length == SQL_NTS) { - size_t i = 0; - while (sqlwStr[i] != 0) - ++i; - length = i; - } - return std::wstring(reinterpret_cast(sqlwStr), length); -} - #endif #if defined(__APPLE__) || defined(__linux__) @@ -76,59 +64,6 @@ inline bool IsValidUnicodeScalar(uint32_t cp) { !(cp >= UNICODE_SURROGATE_HIGH_START && cp <= UNICODE_SURROGATE_LOW_END); } -inline std::wstring SQLWCHARToWString(const SQLWCHAR* sqlwStr, size_t length = SQL_NTS) { - if (!sqlwStr) - return std::wstring(); - if (length == SQL_NTS) { - size_t i = 0; - while (sqlwStr[i] != 0) - ++i; - length = i; - } - std::wstring result; - result.reserve(length); - if constexpr (sizeof(SQLWCHAR) == 2) { - // Use a manual increment to handle skipping - for (size_t i = 0; i < length;) { - uint16_t wc = static_cast(sqlwStr[i]); - // Check for high surrogate and valid low surrogate - if (wc >= UNICODE_SURROGATE_HIGH_START && wc <= UNICODE_SURROGATE_HIGH_END && - (i + 1 < length)) { - uint16_t low = static_cast(sqlwStr[i + 1]); - if (low >= UNICODE_SURROGATE_LOW_START && low <= UNICODE_SURROGATE_LOW_END) { - // Combine into a single code point - uint32_t cp = (((wc - UNICODE_SURROGATE_HIGH_START) << 10) | - (low - UNICODE_SURROGATE_LOW_START)) + - 0x10000; - result.push_back(static_cast(cp)); - i += 2; // Move past both surrogates - continue; - } - } - // If we reach here, it's not a valid surrogate pair or is a BMP - // character. Check if it's a valid scalar and append, otherwise - // append replacement char. - if (IsValidUnicodeScalar(wc)) { - result.push_back(static_cast(wc)); - } else { - result.push_back(static_cast(UNICODE_REPLACEMENT_CHAR)); - } - ++i; // Move to the next code unit - } - } else { - // SQLWCHAR is UTF-32, so just copy with validation - for (size_t i = 0; i < length; ++i) { - uint32_t cp = static_cast(sqlwStr[i]); - if (IsValidUnicodeScalar(cp)) { - result.push_back(static_cast(cp)); - } else { - result.push_back(static_cast(UNICODE_REPLACEMENT_CHAR)); - } - } - } - return result; -} - inline std::vector WStringToSQLWCHAR(const std::wstring& str) { std::vector result; result.reserve(str.size() + 2); @@ -167,9 +102,36 @@ inline std::vector WStringToSQLWCHAR(const std::wstring& str) { } #endif -#if defined(__APPLE__) || defined(__linux__) -#include "unix_utils.h" // Unix-specific fixes -#endif +inline std::string utf16LeToUtf8Alloc(std::u16string utf16) { + if (utf16.empty()) { + return {}; + } + + simdutf::result utf8Length = + simdutf::utf8_length_from_utf16le_with_replacement(utf16.data(), utf16.size()); + std::string utf8(utf8Length.count, '\0'); + utf8.resize( + simdutf::convert_utf16le_to_utf8_with_replacement(utf16.data(), utf16.size(), utf8.data())); + return utf8; +} + +inline std::u16string dupeSqlWCharAsUtf16Le(const SQLWCHAR* value, size_t length) { + std::u16string utf16(length, u'\0'); + static_assert(sizeof(SQLWCHAR) == sizeof(char16_t), "SQLWCHAR must be 16-bit"); + + if (length > 0) { + std::memcpy(utf16.data(), value, length * sizeof(SQLWCHAR)); + } + return utf16; +} + +inline const SQLWCHAR* reinterpretU16stringAsSqlWChar(const std::u16string& utf16) { + static_assert(sizeof(std::u16string::value_type) == sizeof(SQLWCHAR), + "SQLWCHAR must same as u16string"); + static_assert(alignof(std::u16string::value_type) == alignof(SQLWCHAR), + "SQLWCHAR must same as u16string"); + return reinterpret_cast(utf16.c_str()); +} //------------------------------------------------------------------------------------------------- // Function pointer typedefs @@ -403,172 +365,11 @@ using SqlHandlePtr = std::shared_ptr; // This struct is used to relay error info obtained from SQLDiagRec API to the // Python module struct ErrorInfo { - std::wstring sqlState; - std::wstring ddbcErrorMsg; + std::string sqlState; + std::string ddbcErrorMsg; }; ErrorInfo SQLCheckError_Wrap(SQLSMALLINT handleType, SqlHandlePtr handle, SQLRETURN retcode); -inline std::string WideToUTF8(const std::wstring& wstr) { - if (wstr.empty()) - return {}; - -#if defined(_WIN32) - int size_needed = WideCharToMultiByte(CP_UTF8, 0, wstr.data(), static_cast(wstr.size()), - nullptr, 0, nullptr, nullptr); - if (size_needed == 0) - return {}; - std::string result(size_needed, 0); - int converted = WideCharToMultiByte(CP_UTF8, 0, wstr.data(), static_cast(wstr.size()), - result.data(), size_needed, nullptr, nullptr); - if (converted == 0) - return {}; - return result; -#else - // Manual UTF-32 to UTF-8 conversion for macOS/Linux - std::string utf8_string; - // Reserve enough space for worst case (4 bytes per character) - utf8_string.reserve(wstr.size() * 4); - - for (wchar_t wc : wstr) { - uint32_t code_point = static_cast(wc); - - if (code_point <= 0x7F) { - // 1-byte UTF-8 sequence for ASCII characters - utf8_string += static_cast(code_point); - } else if (code_point <= 0x7FF) { - // 2-byte UTF-8 sequence - utf8_string += static_cast(0xC0 | ((code_point >> 6) & 0x1F)); - utf8_string += static_cast(0x80 | (code_point & 0x3F)); - } else if (code_point <= 0xFFFF) { - // 3-byte UTF-8 sequence - utf8_string += static_cast(0xE0 | ((code_point >> 12) & 0x0F)); - utf8_string += static_cast(0x80 | ((code_point >> 6) & 0x3F)); - utf8_string += static_cast(0x80 | (code_point & 0x3F)); - } else if (code_point <= 0x10FFFF) { - // 4-byte UTF-8 sequence for characters like emojis (e.g., U+1F604) - utf8_string += static_cast(0xF0 | ((code_point >> 18) & 0x07)); - utf8_string += static_cast(0x80 | ((code_point >> 12) & 0x3F)); - utf8_string += static_cast(0x80 | ((code_point >> 6) & 0x3F)); - utf8_string += static_cast(0x80 | (code_point & 0x3F)); - } - } - return utf8_string; -#endif -} - -inline std::wstring Utf8ToWString(const std::string& str) { - if (str.empty()) - return {}; -#if defined(_WIN32) - int size_needed = - MultiByteToWideChar(CP_UTF8, 0, str.data(), static_cast(str.size()), nullptr, 0); - if (size_needed == 0) { - LOG_ERROR("MultiByteToWideChar failed for UTF8 to wide string conversion"); - return {}; - } - std::wstring result(size_needed, 0); - int converted = MultiByteToWideChar(CP_UTF8, 0, str.data(), static_cast(str.size()), - result.data(), size_needed); - if (converted == 0) - return {}; - return result; -#else - // Optimized UTF-8 to UTF-32 conversion (wstring on Unix) - - // Lambda to decode UTF-8 multi-byte sequences - auto decodeUtf8 = [](const unsigned char* data, size_t& i, size_t len) -> wchar_t { - unsigned char byte = data[i]; - - // 1-byte sequence (ASCII): 0xxxxxxx - if (byte <= 0x7F) { - ++i; - return static_cast(byte); - } - // 2-byte sequence: 110xxxxx 10xxxxxx - if ((byte & 0xE0) == 0xC0 && i + 1 < len) { - // Validate continuation byte has correct bit pattern (10xxxxxx) - if ((data[i + 1] & 0xC0) != 0x80) { - ++i; - return 0xFFFD; // Invalid continuation byte - } - uint32_t cp = ((static_cast(byte & 0x1F) << 6) | (data[i + 1] & 0x3F)); - // Reject overlong encodings (must be >= 0x80) - if (cp >= 0x80) { - i += 2; - return static_cast(cp); - } - // Overlong encoding - invalid - ++i; - return 0xFFFD; - } - // 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx - if ((byte & 0xF0) == 0xE0 && i + 2 < len) { - // Validate continuation bytes have correct bit pattern (10xxxxxx) - if ((data[i + 1] & 0xC0) != 0x80 || (data[i + 2] & 0xC0) != 0x80) { - ++i; - return 0xFFFD; // Invalid continuation bytes - } - uint32_t cp = ((static_cast(byte & 0x0F) << 12) | - ((data[i + 1] & 0x3F) << 6) | (data[i + 2] & 0x3F)); - // Reject overlong encodings (must be >= 0x800) and surrogates (0xD800-0xDFFF) - if (cp >= 0x800 && (cp < 0xD800 || cp > 0xDFFF)) { - i += 3; - return static_cast(cp); - } - // Overlong encoding or surrogate - invalid - ++i; - return 0xFFFD; - } - // 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - if ((byte & 0xF8) == 0xF0 && i + 3 < len) { - // Validate continuation bytes have correct bit pattern (10xxxxxx) - if ((data[i + 1] & 0xC0) != 0x80 || (data[i + 2] & 0xC0) != 0x80 || - (data[i + 3] & 0xC0) != 0x80) { - ++i; - return 0xFFFD; // Invalid continuation bytes - } - uint32_t cp = - ((static_cast(byte & 0x07) << 18) | ((data[i + 1] & 0x3F) << 12) | - ((data[i + 2] & 0x3F) << 6) | (data[i + 3] & 0x3F)); - // Reject overlong encodings (must be >= 0x10000) and values above max Unicode - if (cp >= 0x10000 && cp <= 0x10FFFF) { - i += 4; - return static_cast(cp); - } - // Overlong encoding or out of range - invalid - ++i; - return 0xFFFD; - } - // Invalid sequence - skip byte - ++i; - return 0xFFFD; // Unicode replacement character - }; - - std::wstring result; - result.reserve(str.size()); // Reserve assuming mostly ASCII - - const unsigned char* data = reinterpret_cast(str.data()); - const size_t len = str.size(); - size_t i = 0; - - // Fast path for ASCII-only prefix (most common case) - while (i < len && data[i] <= 0x7F) { - result.push_back(static_cast(data[i])); - ++i; - } - - // Handle remaining multi-byte sequences - while (i < len) { - wchar_t wc = decodeUtf8(data, i, len); - // Always push the decoded character (including 0xFFFD replacement characters) - // This correctly handles both legitimate 0xFFFD in input and invalid sequences - result.push_back(wc); - } - - return result; -#endif -} - // Thread-safe decimal separator accessor class class ThreadSafeDecimalSeparator { private: diff --git a/mssql_python/pybind/unix_utils.cpp b/mssql_python/pybind/unix_utils.cpp index c4756286..587cfae8 100644 --- a/mssql_python/pybind/unix_utils.cpp +++ b/mssql_python/pybind/unix_utils.cpp @@ -22,72 +22,6 @@ constexpr uint32_t kUnicodeMaxCodePoint = 0x10FFFF; const char* kOdbcEncoding = "utf-16-le"; // ODBC uses UTF-16LE for SQLWCHAR const size_t kUcsLength = 2; // SQLWCHAR is 2 bytes on all platforms -// Function to convert SQLWCHAR strings to std::wstring on macOS -// THREAD-SAFE: Uses thread_local converter to avoid std::wstring_convert race conditions -std::wstring SQLWCHARToWString(const SQLWCHAR* sqlwStr, size_t length = SQL_NTS) { - if (!sqlwStr) { - return std::wstring(); - } - - // Lambda to calculate string length using pointer arithmetic - auto calculateLength = [](const SQLWCHAR* str) -> size_t { - const SQLWCHAR* p = str; - while (*p) - ++p; - return p - str; - }; - - if (length == SQL_NTS) { - length = calculateLength(sqlwStr); - } - - if (length == 0) { - return std::wstring(); - } - - // Lambda to check if character is in Basic Multilingual Plane - auto isBMP = [](uint16_t ch) { return ch < 0xD800 || ch > 0xDFFF; }; - - // Lambda to decode surrogate pair into code point - auto decodeSurrogatePair = [](uint16_t high, uint16_t low) -> uint32_t { - return 0x10000 + (static_cast(high & 0x3FF) << 10) + (low & 0x3FF); - }; - - // Convert UTF-16 to UTF-32 directly without intermediate buffer - std::wstring result; - result.reserve(length); // Reserve assuming most chars are BMP - - size_t i = 0; - while (i < length) { - uint16_t utf16Char = static_cast(sqlwStr[i]); - - // Fast path: BMP character (most common - ~99% of strings) - if (isBMP(utf16Char)) { - result.push_back(static_cast(utf16Char)); - ++i; - } - // Handle surrogate pairs for characters outside BMP - else if (utf16Char <= 0xDBFF) { // High surrogate - if (i + 1 < length) { - uint16_t lowSurrogate = static_cast(sqlwStr[i + 1]); - if (lowSurrogate >= 0xDC00 && lowSurrogate <= 0xDFFF) { - uint32_t codePoint = decodeSurrogatePair(utf16Char, lowSurrogate); - result.push_back(static_cast(codePoint)); - i += 2; - continue; - } - } - // Invalid surrogate - replace with Unicode replacement character - result.push_back(static_cast(kUnicodeReplacementChar)); - ++i; - } else { // Low surrogate without high - invalid, replace with replacement character - result.push_back(static_cast(kUnicodeReplacementChar)); - ++i; - } - } - return result; -} - // Function to convert std::wstring to SQLWCHAR array on macOS/Linux // Converts UTF-32 (wstring on Unix) to UTF-16 (SQLWCHAR) // Invalid Unicode scalars (surrogates, values > 0x10FFFF) are replaced with U+FFFD diff --git a/mssql_python/pybind/unix_utils.h b/mssql_python/pybind/unix_utils.h index ff528759..62c0b24d 100644 --- a/mssql_python/pybind/unix_utils.h +++ b/mssql_python/pybind/unix_utils.h @@ -23,10 +23,6 @@ namespace py = pybind11; extern const char* kOdbcEncoding; // ODBC uses UTF-16LE for SQLWCHAR extern const size_t kUcsLength; // SQLWCHAR is 2 bytes on all platforms -// Function to convert SQLWCHAR strings to std::wstring on macOS -// Removed default argument to avoid redefinition conflict -std::wstring SQLWCHARToWString(const SQLWCHAR* sqlwStr, size_t length); - // Function to convert std::wstring to SQLWCHAR array on macOS std::vector WStringToSQLWCHAR(const std::wstring& str);