Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Idea: Adding the possibility to create a Dataframe from a KQL query #684

Open
wants to merge 11 commits into
base: main
Choose a base branch
from
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[submodule "detection-rules"]
path = detection-rules
url = https://github.com/elastic/detection-rules
1 change: 1 addition & 0 deletions detection-rules
Submodule detection-rules added at c3ca01
40 changes: 40 additions & 0 deletions eland/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@

import numpy as np
import pandas as pd # type: ignore
from kql import to_dsl
from pandas.core.common import apply_if_callable, is_bool_indexer # type: ignore
from pandas.core.computation.eval import eval # type: ignore
from pandas.core.dtypes.common import is_list_like # type: ignore
Expand Down Expand Up @@ -790,6 +791,45 @@ def es_query(self, query) -> "DataFrame":
query = query["query"]
return DataFrame(_query_compiler=self._query_compiler.es_query(query))

def kql_query(self, query):
    """Filter the DataFrame with a Kibana Query Language (KQL / Kuery) query.

    The KQL string is first converted to Elasticsearch DSL via ``to_dsl``
    and the resulting query is then applied through ``es_query``.
    Note that KQL can only be used to filter data.

    Parameters
    ----------
    query:
        KQL query as a string

    Returns
    -------
    eland.DataFrame:
        eland DataFrame with the query applied

    Examples
    --------

    Filter the flights dataset down to flights from Milan that take longer than 12 hours.

    >>> columns = ["OriginCityName", "DestCityName", "FlightTimeHour", "AvgTicketPrice"]
    >>> df = ed.DataFrame('http://localhost:9200', 'flights', columns=columns)
    >>> df.kql_query('OriginCityName:Milan and FlightTimeHour > 12').head(10)
    OriginCityName DestCityName FlightTimeHour AvgTicketPrice
    468 Milan Chitose / Tomakomai 15.499956 646.162588
    471 Milan Tokyo 14.805640 933.586896
    726 Milan Sydney 14.552814 574.534422
    886 Milan Buenos Aires 16.938412 748.639741
    1097 Milan Sydney 25.469282 913.483049
    1708 Milan Buenos Aires 17.718905 975.483549
    1865 Milan Sydney 20.014910 956.411751
    2410 Milan Melbourne 15.988900 650.720199
    2463 Milan Sydney 13.166832 344.815508
    2487 Milan Buenos Aires 14.916122 684.506066
    <BLANKLINE>
    [10 rows x 4 columns]
    """
    # Translate the KQL text into a DSL query, then reuse the existing
    # DSL-based filtering entry point.
    dsl_query = to_dsl(query)
    return self.es_query(dsl_query)

def _index_summary(self):
# Print index summary e.g.
# Index: 103 entries, 0 to 102
Expand Down
1 change: 1 addition & 0 deletions kql
1 change: 1 addition & 0 deletions noxfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ def lint(session):
# https://numpy.org/devdocs/reference/typing.html#mypy-plugin
session.install("black", "flake8", "mypy", "isort", "numpy")
session.install(".")
session.install("git+https://github.com/elastic/detection-rules")
session.run("python", "utils/license-headers.py", "check", *SOURCE_FILES)
session.run("black", "--check", "--target-version=py38", *SOURCE_FILES)
session.run("isort", "--check", "--profile=black", *SOURCE_FILES)
Expand Down
4 changes: 3 additions & 1 deletion requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,6 @@ nox
numpydoc>=0.9.0
mypy
build
twine
twine

git+https://github.com/elastic/detection-rules
24 changes: 22 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,19 +84,39 @@
license="Apache-2.0",
classifiers=CLASSIFIERS,
keywords="elastic eland pandas python",
packages=find_packages(include=["eland", "eland.*"]),
packages=find_packages(include=["eland", "eland.*", "kql"]),
install_requires=[
"elasticsearch>=8.3,<9",
"pandas>=1.5,<2",
"matplotlib>=3.6",
"numpy>=1.2.0,<2",
"packaging",
"Click~=8.1.0",
"eql==0.9.19",
"jsl==0.2.4",
"jsonschema>=3.2.0",
"marko==2.0.1",
"marshmallow-dataclass[union]~=8.5.12",
"marshmallow-jsonschema~=0.12.0",
"marshmallow-union~=0.1.15",
"marshmallow~=3.13.0",
"pywin32 ; platform_system=='Windows'",
"pytoml==0.1.21",
"PyYAML~=5.3 ; python_version<='3.9'",
"PyYAML~=6.0.1 ; python_version>='3.10'",
"requests~=2.27",
"toml==0.10.0",
"typing-inspect==0.8.0",
"typing-extensions==4.5.0 ; python_version<='3.11'",
"typing-extensions==4.8.0 ; python_version>='3.12'",
"XlsxWriter~=1.3.6",
"semver==3.0.0-dev.4",
],
entry_points={
"console_scripts": "eland_import_hub_model=eland.cli.eland_import_hub_model:main"
},
python_requires=">=3.8,<3.12",
package_data={"eland": ["py.typed"]},
package_data={"eland": ["py.typed"], "kql": ["*.g"]},
include_package_data=True,
zip_safe=False,
extras_require=extras,
Expand Down
36 changes: 36 additions & 0 deletions tests/dataframe/test_kql_query_pytest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Licensed to Elasticsearch B.V. under one or more contributor
# license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright
# ownership. Elasticsearch B.V. licenses this file to you under
# the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# File called _pytest for PyCharm compatibility

from tests.common import TestData, assert_eland_frame_equal


class TestDataKQLQuery(TestData):
    """Tests for ``DataFrame.kql_query`` against the flights test data."""

    def test_flights_match_query(self):
        """A KQL filter and a boolean mask give the same frame in either order."""
        flights = self.ed_flights()
        kql = "OriginCityName:Rome"
        carrier = "Kibana Airlines"

        # KQL filter first, boolean mask second.
        kql_then_mask = flights.kql_query(kql)[flights["Carrier"] == carrier]

        # Boolean mask first, KQL filter second.
        mask_then_kql = flights[flights["Carrier"] == carrier].kql_query(kql)

        # Guard against the comparison passing trivially on empty frames.
        assert len(kql_then_mask) > 0
        assert_eland_frame_equal(kql_then_mask, mask_then_kql)