Skip to content
12 changes: 11 additions & 1 deletion .github/workflows/build_linux_arm64_wheels-gh.yml
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ jobs:
which clang++-19
clang++-19 --version
sudo apt-get install -y make cmake ccache ninja-build yasm gawk wget
# Install WebAssembly linker (wasm-ld)
# Install WebAssembly linker (wasm-ld)
sudo apt-get install -y lld-19
# Create symlink for wasm-ld
if ! command -v wasm-ld &> /dev/null; then
Expand Down Expand Up @@ -263,7 +263,17 @@ jobs:
pyenv shell $version
python -m pip install dist/*.whl --force-reinstall
python -c "import chdb; res = chdb.query('select 1112222222,555', 'CSV'); print(f'Python $version: {res}')"

# First test: without optional dependencies
echo "Testing without pandas and pyarrow..."
python -m pip uninstall -y pandas pyarrow || true
make test

# Second test: with optional dependencies
echo "Testing with pandas and pyarrow..."
python -m pip install pandas pyarrow
make test

pyenv shell --unset
done
continue-on-error: false
Expand Down
10 changes: 10 additions & 0 deletions .github/workflows/build_linux_x86_wheels.yml
Original file line number Diff line number Diff line change
Expand Up @@ -262,7 +262,17 @@ jobs:
pyenv shell $version
python -m pip install dist/*.whl --force-reinstall
python -c "import chdb; res = chdb.query('select 1112222222,555', 'CSV'); print(f'Python $version: {res}')"
# First test: without optional dependencies
echo "Testing without pandas and pyarrow..."
python -m pip uninstall -y pandas pyarrow || true
make test
# Second test: with optional dependencies
echo "Testing with pandas and pyarrow..."
python -m pip install pandas pyarrow
make test
pyenv shell --unset
done
continue-on-error: false
Expand Down
9 changes: 9 additions & 0 deletions .github/workflows/build_macos_arm64_wheels.yml
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,15 @@ jobs:
pyenv shell $version
python -m pip install dist/*.whl --force-reinstall --no-cache-dir
python -c "import chdb; res = chdb.query('select 1112222222,555', 'CSV'); print(f'Python $version: {res}')"

# First test: without optional dependencies
echo "Testing without pandas and pyarrow..."
python -m pip uninstall -y pandas pyarrow || true
make test

# Second test: with optional dependencies
echo "Testing with pandas and pyarrow..."
python -m pip install pandas pyarrow
make test
python -m pip uninstall -y chdb
pyenv shell --unset
Expand Down
10 changes: 10 additions & 0 deletions .github/workflows/build_macos_x86_wheels.yml
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,17 @@ jobs:
pyenv shell $version
python -m pip install dist/*.whl --force-reinstall
python -c "import chdb; res = chdb.query('select 1112222222,555', 'CSV'); print(f'Python $version: {res}')"

# First test: without optional dependencies
echo "Testing without pandas and pyarrow..."
python -m pip uninstall -y pandas pyarrow || true
make test

# Second test: with optional dependencies
echo "Testing with pandas and pyarrow..."
python -m pip install pandas pyarrow
make test

pyenv shell --unset
done
continue-on-error: false
Expand Down
5 changes: 2 additions & 3 deletions chdb/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,11 +96,10 @@ def to_arrowTable(res):
# try import pyarrow and pandas, if failed, raise ImportError with suggestion
try:
import pyarrow as pa # noqa
import pandas as pd # noqa
except ImportError as e:
print(f"ImportError: {e}")
print('Please install pyarrow and pandas via "pip install pyarrow pandas"')
raise ImportError("Failed to import pyarrow or pandas") from None
print('Please install pyarrow via "pip install pyarrow"')
raise ImportError("Failed to import pyarrow") from None
if len(res) == 0:
return pa.Table.from_batches([], schema=pa.schema([]))

Expand Down
9 changes: 7 additions & 2 deletions chdb/dataframe/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,13 @@
import tempfile
from io import BytesIO
import re
import pandas as pd
import pyarrow as pa
try:
import pandas as pd
import pyarrow as pa
except ImportError as e:
print(f'ImportError: {e}')
print('Please install pyarrow and pandas via "pip install pyarrow pandas"')
raise ImportError('Failed to import pyarrow or pandas') from None
from chdb import query as chdb_query


Expand Down
63 changes: 41 additions & 22 deletions chdb/state/sqlitelike.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,32 @@
from typing import Optional, Any
from typing import Optional, Any, TYPE_CHECKING, List, Tuple
from chdb import _chdb

# try import pyarrow if failed, raise ImportError with suggestion
try:
import pyarrow as pa # noqa
except ImportError as e:
print(f"ImportError: {e}")
print('Please install pyarrow via "pip install pyarrow"')
raise ImportError("Failed to import pyarrow") from None
if TYPE_CHECKING:
import pyarrow as pa


def _import_pyarrow():
"""Lazy import pyarrow when needed."""
try:
import pyarrow as pa
return pa
except ImportError:
raise ImportError(
"PyArrow is required for this feature. "
"Install with: pip install pyarrow"
) from None


def _import_pandas():
"""Lazy import pandas when needed."""
try:
import pandas as pd
return pd
except ImportError:
raise ImportError(
"Pandas is required for DataFrame conversion. "
"Install with: pip install pandas"
) from None


_arrow_format = set({"dataframe", "arrowtable"})
Expand All @@ -32,11 +51,11 @@ def to_arrowTable(res):
pyarrow.Table: PyArrow Table containing the query results

Raises:
ImportError: If pyarrow or pandas packages are not installed
ImportError: If pyarrow package is not installed

.. note::
This function requires both pyarrow and pandas to be installed.
Install them with: ``pip install pyarrow pandas``
This function requires pyarrow to be installed.
Install with: ``pip install pyarrow``

.. warning::
Empty results return an empty PyArrow Table with no schema.
Expand All @@ -52,14 +71,7 @@ def to_arrowTable(res):
num text
0 1 hello
"""
# try import pyarrow and pandas, if failed, raise ImportError with suggestion
try:
import pyarrow as pa # noqa
import pandas as pd # noqa
except ImportError as e:
print(f"ImportError: {e}")
print('Please install pyarrow and pandas via "pip install pyarrow pandas"')
raise ImportError("Failed to import pyarrow or pandas") from None
pa = _import_pyarrow()
if len(res) == 0:
return pa.Table.from_batches([], schema=pa.schema([]))

Expand Down Expand Up @@ -104,6 +116,7 @@ def to_df(r):
text object
dtype: object
"""
_import_pandas()
t = to_arrowTable(r)
return t.to_pandas(use_threads=True)

Expand Down Expand Up @@ -232,7 +245,7 @@ def cancel(self):
except Exception as e:
raise RuntimeError(f"Failed to cancel streaming query: {str(e)}") from e

def record_batch(self, rows_per_batch: int = 1000000) -> pa.RecordBatchReader:
def record_batch(self, rows_per_batch: int = 1000000) -> "pa.RecordBatchReader":
"""
Create a PyArrow RecordBatchReader from this StreamingResult.

Expand All @@ -244,17 +257,19 @@ def record_batch(self, rows_per_batch: int = 1000000) -> pa.RecordBatchReader:
rows_per_batch (int): Number of rows per batch. Defaults to 1000000.

Returns:
pa.RecordBatchReader: PyArrow RecordBatchReader for efficient streaming
pyarrow.RecordBatchReader: PyArrow RecordBatchReader for efficient streaming

Raises:
ValueError: If the StreamingResult was not created with arrow format
ImportError: If PyArrow is not installed
"""
if not self._supports_record_batch:
raise ValueError(
"record_batch() can only be used with arrow format. "
"Please use format='Arrow' when calling send_query."
)

pa = _import_pyarrow()
chdb_reader = ChdbRecordBatchReader(self, rows_per_batch)
return pa.RecordBatchReader.from_batches(chdb_reader.schema(), chdb_reader)

Expand All @@ -277,10 +292,12 @@ def __init__(self, chdb_stream_result, batch_size_rows):
self._current_rows = 0
self._first_batch = None
self._first_batch_consumed = True
self._pa = _import_pyarrow()
self._schema = self.schema()

def schema(self):
if self._schema is None:
pa = self._pa
# Get the first chunk to determine schema
chunk = self._stream_result.fetch()
if chunk is not None:
Expand All @@ -306,6 +323,8 @@ def schema(self):
return self._schema

def read_next_batch(self):
pa = self._pa

if self._accumulator:
result = self._accumulator.pop(0)
return result
Expand Down Expand Up @@ -602,7 +621,7 @@ class Cursor:
def __init__(self, connection):
self._conn = connection
self._cursor = self._conn.cursor()
self._current_table: Optional[pa.Table] = None
self._current_table: Optional[List[Tuple]] = None
self._current_row: int = 0

def execute(self, query: str) -> None:
Expand Down
13 changes: 8 additions & 5 deletions programs/local/PythonConversion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -285,12 +285,15 @@ void convert_to_json_str(const py::handle & obj, String & ret)
d.SetObject();
rapidjson::Document::AllocatorType & allocator = d.GetAllocator();

auto sys_modules = py::module_::import("sys").attr("modules");
bool has_numpy = sys_modules.contains(py::str("numpy"));

std::function<void(const py::handle &, rapidjson::Value &)> convert;
convert = [&](const py::handle & obj, rapidjson::Value & json_value) {
if (py::isinstance<py::dict>(obj))
{
json_value.SetObject();
for (auto & item : py::cast<py::dict>(obj))
for (const auto & item : py::cast<py::dict>(obj))
{
rapidjson::Value key;
auto key_str = py::str(item.first).cast<std::string>();
Expand All @@ -306,7 +309,7 @@ void convert_to_json_str(const py::handle & obj, String & ret)
{
json_value.SetArray();
auto tmp_list = py::cast<py::list>(obj);
for (auto & item : tmp_list)
for (const auto & item : tmp_list)
{
rapidjson::Value element;
convert(item, element);
Expand All @@ -317,14 +320,14 @@ void convert_to_json_str(const py::handle & obj, String & ret)
{
json_value.SetArray();
auto tmp_tuple = py::cast<py::tuple>(obj);
for (auto & item : tmp_tuple)
for (const auto & item : tmp_tuple)
{
rapidjson::Value element;
convert(item, element);
json_value.PushBack(element, allocator);
}
}
else if (py::isinstance<py::array>(obj))
else if (has_numpy && py::isinstance<py::array>(obj))
{
auto arr = py::cast<py::array>(obj);
json_value.SetArray();
Expand All @@ -337,7 +340,7 @@ void convert_to_json_str(const py::handle & obj, String & ret)
auto item = my_list.attr("__getitem__")(i);
convert(item, element);
json_value.PushBack(element, allocator);
}
}
}
else
{
Expand Down
3 changes: 2 additions & 1 deletion programs/local/PythonTableCache.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include "PythonTableCache.h"
#include "PandasDataFrame.h"
#include "PybindWrapper.h"
#include "PythonUtils.h"

Expand Down Expand Up @@ -32,7 +33,7 @@ static py::object findQueryableObj(const String & var_name)
{
// Get the object using Python's indexing syntax
obj = namespace_obj[py::cast(var_name)];
if (DB::isInheritsFromPyReader(obj) || DB::isPandasDf(obj) || DB::isPyarrowTable(obj) || DB::hasGetItem(obj))
if (DB::isInheritsFromPyReader(obj) || PandasDataFrame::isPandasDataframe(obj) || DB::isPyarrowTable(obj) || DB::hasGetItem(obj))
{
return obj;
}
Expand Down
37 changes: 9 additions & 28 deletions programs/local/PythonUtils.h
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
#pragma once

#include "config.h"
#include "PybindWrapper.h"

#include <cstddef>
#include <Columns/ColumnString.h>
#include <Columns/IColumn.h>
#include <DataTypes/Serializations/SerializationNumber.h>
Expand Down Expand Up @@ -68,35 +68,21 @@ inline bool isInheritsFromPyReader(const py::object & obj)
return execWithGIL([&]() { return _isInheritsFromPyReader(obj); });
}

// Helper function to check if object is a pandas DataFrame
inline bool isPandasDf(const py::object & obj)
{
return execWithGIL(
[&]()
{
auto pd_data_frame_type = py::module_::import("pandas").attr("DataFrame");
return py::isinstance(obj, pd_data_frame_type);
});
}

// Helper function to check if object is a PyArrow Table
inline bool isPyarrowTable(const py::object & obj)
{
return execWithGIL(
[&]()
{
auto table_type = py::module_::import("pyarrow").attr("Table");
return py::isinstance(obj, table_type);
});
chassert(py::gil_check());
auto dict = py::module_::import("sys").attr("modules");
if (!dict.contains(py::str("pyarrow")))
return false;

return py::isinstance(obj, py::module_::import("pyarrow").attr("Table"));
}

inline bool hasGetItem(const py::object & obj)
{
return execWithGIL(
[&]()
{
return py::hasattr(obj, "__getitem__");
});
chassert(py::gil_check());
return py::hasattr(obj, "__getitem__");
}

// Specific wrappers for common use cases
Expand All @@ -105,11 +91,6 @@ inline auto castToPyList(const py::object & obj)
return execWithGIL([&]() { return obj.cast<py::list>(); });
}

// Cast a generic Python object to py::array; the cast runs inside
// execWithGIL (presumably acquiring the GIL — confirm against its definition).
inline auto castToPyArray(const py::object & obj)
{
    auto do_cast = [&]() { return obj.cast<py::array>(); };
    return execWithGIL(do_cast);
}

inline std::string castToStr(const py::object & obj)
{
return execWithGIL([&]() { return py::str(obj).cast<std::string>(); });
Expand Down
7 changes: 5 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,9 +175,12 @@ def build_extensions(self):
ext_modules=ext_modules,
python_requires=">=3.8",
install_requires=[
"pyarrow>=13.0.0",
"pandas>=2.0.0",
],
extras_require={
"arrow": ["pandas>=2.0.0", "pyarrow>=13.0.0"],
"pandas": ["pandas>=2.0.0", "pyarrow>=13.0.0"],
"all": ["pandas>=2.0.0", "pyarrow>=13.0.0"],
},
cmdclass={"build_ext": BuildExt},
test_suite="tests",
zip_safe=False,
Expand Down
Loading
Loading