Skip to content

Commit

Permalink
48-interval-schema (#50)
Browse files Browse the repository at this point in the history
* Added schemas for raw and interval userale logs

* Added polymorphic UserAleSchema and some tests

* Small changes based on review

---------

Co-authored-by: jlhitzeman <[email protected]>
Co-authored-by: rthenhaus <rthenhaus>
  • Loading branch information
rc10house and jlhitzeman authored Jul 15, 2024
1 parent c127bc9 commit 5048552
Show file tree
Hide file tree
Showing 7 changed files with 129 additions and 56 deletions.
8 changes: 4 additions & 4 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,20 +30,20 @@ repos:
- id: end-of-file-fixer
- id: mixed-line-ending
repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.5.0
rev: v4.6.0
- repo: https://github.com/commitizen-tools/commitizen
rev: v3.18.0 # automatically updated by Commitizen
rev: v3.27.0 # automatically updated by Commitizen
hooks:
- id: commitizen
stages: [commit-msg]
- hooks:
- id: flake8
repo: https://github.com/pycqa/flake8
rev: 7.0.0
rev: 7.1.0
- hooks:
- id: black
repo: https://github.com/psf/black
rev: 24.2.0
rev: 24.4.2
- hooks:
- args:
- --profile
Expand Down
30 changes: 16 additions & 14 deletions distill/core/log.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,17 +14,17 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import json

from pydantic import BaseModel
from pydantic.type_adapter import TypeAdapter
from typing import Dict, Union

from pksuid import PKSUID
from pydantic import BaseModel, parse_obj_as
from pydantic.type_adapter import TypeAdapter

from distill.core.types import JsonDict, JSONSerializable
from distill.schemas.userale import UserAleSchema
from distill.core.types import JsonDict, JSONSerializable, UserAleSchema

ta = TypeAdapter(JsonDict)


class Log:
"""
Base class for log object representation.
Expand All @@ -34,27 +34,29 @@ class Log:
defaults to UserAle log schema
"""

def __init__(self, data: Union[str, JsonDict], schema=UserAleSchema):
if not issubclass(schema, BaseModel):
def __init__(self, data: Union[str, JsonDict], schema=None):
if schema is None:
schema = UserAleSchema
elif issubclass(schema, BaseModel):
raise TypeError("schema should inherit from pydantic.BaseModel")

if isinstance(data, str):
schema.model_validate_json(data, strict=True)
hash_sfx = str(hash(data))
data = json.loads(data)
elif ta.validate_python(data):
hash_sfx = str(hash(json.dumps(data)))
schema.model_validate(data, strict=True)
else:
raise TypeError("ERROR: " + str(type(data)) + " data should be either a string or a JsonDict")
self.data = schema(**data)

self.id = PKSUID("log_" + hash_sfx, schema._timestamp(self.data))
raise TypeError(
"ERROR: "
+ str(type(data))
+ " data should be either a string or a JsonDict"
)
self.data = schema.validate_python(data)

self.id = PKSUID("log_" + hash_sfx, self.data._timestamp())

def to_json(self) -> str:
return self.data.model_dump_json(by_alias=True)

def to_dict(self) -> JsonDict:
return self.data.model_dump(by_alias=True)

25 changes: 15 additions & 10 deletions distill/core/types.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,27 @@
from typing import Union, List, Dict
from typing import Dict, List, Union

from pydantic.type_adapter import TypeAdapter
from typing_extensions import TypeAliasType

from distill.schemas.userale import UserAleRawSchema, UserAleIntervalSchema

# TypeAliasType is necessary to avoid recursion error when validating this
# type with Pydantic
JSONSerializable = TypeAliasType(
"JSONSerializable",
Union[str,
int,
float,
bool,
None,
List['JSONSerializable'],
Dict[str, 'JSONSerializable']
Union[
str,
int,
float,
bool,
None,
List["JSONSerializable"],
Dict[str, "JSONSerializable"],
],
)

JsonDict = Dict[str, 'JSONSerializable']
JsonDict = Dict[str, "JSONSerializable"]

Timestamp = Union[str, int, float]


UserAleSchema = TypeAdapter(Union[UserAleRawSchema, UserAleIntervalSchema])
89 changes: 69 additions & 20 deletions distill/schemas/userale.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,22 +13,19 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List, Optional
from datetime import datetime
from typing import List, Optional

from pydantic import AliasGenerator, BaseModel, Field, field_serializer, field_validator
from pydantic import BaseModel, Field, AliasGenerator, field_serializer, field_validator
from pydantic.alias_generators import to_camel
from pydantic.config import ConfigDict

from .base import BaseSchema
from datetime import datetime

from distill.schemas.base import BaseSchema

class Browser(BaseModel):
    """Browser name and version pair reported in a userALE log's browser field."""

    browser: str
    version: str


class Location(BaseModel):
x: Optional[int]
y: Optional[int]
Expand All @@ -42,38 +39,90 @@ class ScrnRes(BaseModel):
class Details(BaseModel):
    """Event details attached to a userALE log (only a window flag is modeled here)."""

    window: bool


class UserAleSchema(BaseSchema):
class UserAleBaseSchema(BaseSchema):
"""
A raw or custom log produced by UserAle
"""

model_config = ConfigDict(
title="Log",
alias_generator=AliasGenerator(
validation_alias=to_camel, serialization_alias=to_camel
),
)

target: str
path: List[str]
page_url: str
page_title: str
page_referrer: str
browser: Browser
client_time: int
micro_time: int = Field(..., lt=2)
location: Location
scrn_res: ScrnRes
type_field: str = Field(..., validation_alias="type", serialization_alias="type")
log_type: str
user_action: bool
details: Details
user_id: str
tool_version: Optional[str]
tool_name: Optional[str]
userale_version: Optional[str]
session_id: str
http_session_id: str
browser_session_id: str

def _timestamp(self):
"""
Returns:
float: POSIX time from userALE log's client_time field
"""
pass


class UserAleIntervalSchema(UserAleBaseSchema):
    """
    An interval log produced by userALE, aggregating activity over a time
    span (count/duration with start and end times).
    """

    model_config = ConfigDict(
        title="IntervalLog",
        alias_generator=AliasGenerator(
            validation_alias=to_camel, serialization_alias=to_camel
        ),
    )

    # NOTE: start_time/end_time are annotated as int (epoch milliseconds on
    # the wire), but the "after" validator below replaces the validated value
    # with a datetime — at runtime instances hold datetime objects, which is
    # what _timestamp() relies on.
    count: int
    duration: int
    start_time: int
    end_time: int
    target_change: bool
    type_change: bool

    @field_validator("start_time", "end_time")
    def validate_st(cls, st: float):
        # Convert epoch milliseconds to a datetime; pydantic keeps the
        # returned value without re-checking it against the int annotation.
        return datetime.fromtimestamp(st / 1000)

    @field_serializer("start_time", "end_time")
    def serialize_st(self, st: datetime):
        # Serialize back to epoch milliseconds for round-trip fidelity.
        return int(st.timestamp() * 1000)

    def _timestamp(self):
        """
        Returns:
            float: POSIX time from userALE log's start_time field
        """
        return self.start_time.timestamp()


class UserAleRawSchema(UserAleBaseSchema):
"""
A raw or custom log produced by UserAle
"""

model_config = ConfigDict(
title="RawLog",
alias_generator=AliasGenerator(
validation_alias=to_camel, serialization_alias=to_camel
),
)

client_time: int
micro_time: int = Field(..., lt=2)
location: Location
scrn_res: ScrnRes
details: Details

@field_validator("client_time")
def validate_ct(cls, ct: float):
Expand Down
1 change: 1 addition & 0 deletions tests/data/log_interval_data.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"target": "nav.navigation-bar-desktop", "path": ["nav.navigation-bar-desktop","body.body","html.no-js","#document","Window"], "pageUrl": "https://beam.apache.org/case-studies/", "pageTitle": "Case Studies", "pageReferrer": "https://beam.apache.org/", "browser": {"browser": "chrome","version": "114.0.0"}, "count": 1, "duration": 129, "startTime": 1708447014463, "endTime": 1708447014592, "type": "mouseover", "logType": "interval", "targetChange": true, "typeChange": false, "userAction": false, "userId": "MD", "toolVersion": "", "toolName": "", "useraleVersion": "2.4.0", "sessionId": "session_1708446947239", "httpSessionId": "72798a8ad776417183b1aa14e03c3132", "browserSessionId": "06b0db1ab30e8e92819ba3d4091b83bc"}
2 changes: 1 addition & 1 deletion tests/data/log_test_data.json
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"target": "#document","path": [ "Window" ], "pageUrl": "https://github.com/apache/flagon/tree/master/docker", "pageTitle": "flagon/docker at master · apache/flagon · GitHub", "pageReferrer": "https://gov.teams.microsoft.us/", "browser": { "browser": "chrome", "version": "116.0.0" }, "clientTime": 1719530111079, "microTime": 0,"location": { "x": null, "y": null }, "scrnRes": { "width": 1349, "height":954 }, "type": "load", "logType": "raw", "userAction": true, "details": {"window": true }, "userId": "nobody", "toolVersion": null, "toolName":"test_app", "useraleVersion": "2.3.0", "sessionId":"session_1719530074303"}
{"target": "#document","path": [ "Window" ], "pageUrl": "https://github.com/apache/flagon/tree/master/docker", "pageTitle": "flagon/docker at master · apache/flagon · GitHub", "pageReferrer": "https://gov.teams.microsoft.us/", "browser": { "browser": "chrome", "version": "116.0.0" }, "clientTime": 1719530111079, "microTime": 0,"location": { "x": null, "y": null }, "scrnRes": { "width": 1349, "height":954 }, "type": "load", "logType": "raw", "userAction": true, "details": {"window": true }, "userId": "nobody", "toolVersion": null, "toolName":"test_app", "useraleVersion": "2.3.0", "sessionId":"session_1719530074303", "httpSessionId": "72798a8ad776417183b1aa14e03c3132", "browserSessionId": "06b0db1ab30e8e92819ba3d4091b83bc"}
30 changes: 23 additions & 7 deletions tests/test_log.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,18 +16,18 @@

import json
import os
from datetime import datetime

from pydantic import ValidationError

from distill.core.log import Log
from distill.core.log import Log
from tests.data_config import DATA_DIR
from datetime import datetime


def test_log_constructor():
exception_thrown = False
try:
_ = Log(data="garbage data")
_ = Log(data='{"garbage data": "bad"}')
except ValidationError:
exception_thrown = True
assert exception_thrown == True
Expand All @@ -48,14 +48,24 @@ def test_log_constructor():
assert id.get_timestamp() == 1719530111079 // 1000
assert id.prefix.startswith("log_")

data = load_interval_log()
test_interval_log = Log(data=data)
assert test_interval_log is not None
id = test_interval_log.id
assert id.get_timestamp() == 1708447014463 // 1000


def test_log_serialize():
data = load_log()
test_log = Log(data=data)

correct_str = json.dumps(
json.loads(data), separators=(",", ":"), ensure_ascii=False
)
# correct_str = json.dumps(
# json.loads(data), separators=(",", ":"), ensure_ascii=False
# )
# Hardcoding this for now because creating a polymorphic model does not
# preserve order in pydantic. Our data is still correct but not in the
# original order. There doesn't seem to be an easy way to fix this right now
correct_str = '{"target":"#document","path":["Window"],"pageUrl":"https://github.com/apache/flagon/tree/master/docker","pageTitle":"flagon/docker at master · apache/flagon · GitHub","pageReferrer":"https://gov.teams.microsoft.us/","browser":{"browser":"chrome","version":"116.0.0"},"type":"load","logType":"raw","userAction":true,"userId":"nobody","toolVersion":null,"toolName":"test_app","useraleVersion":"2.3.0","sessionId":"session_1719530074303","httpSessionId":"72798a8ad776417183b1aa14e03c3132","browserSessionId":"06b0db1ab30e8e92819ba3d4091b83bc","clientTime":1719530111079,"microTime":0,"location":{"x":null,"y":null},"scrnRes":{"width":1349,"height":954},"details":{"window":true}}'
serialized_data = test_log.to_json()
assert serialized_data == correct_str

Expand All @@ -73,7 +83,7 @@ def test_log_normalize_timestamp():
data = load_log()
test_log = Log(data=data)

# note provided UserAle schema has clientTime in milliseconds but need it in
# note provided UserAle schema has clientTime in milliseconds but need it in
# seconds to be able to parse
correct_ms = 1719530111079
correct_dt = datetime.fromtimestamp(correct_ms / 1000)
Expand All @@ -86,3 +96,9 @@ def load_log() -> str:
with open(os.path.join(DATA_DIR, "log_test_data.json")) as f:
data = f.readline()
return data


def load_interval_log() -> str:
    """Return the first line of the interval-log JSON test fixture."""
    fixture_path = os.path.join(DATA_DIR, "log_interval_data.json")
    with open(fixture_path) as fixture:
        return fixture.readline()

0 comments on commit 5048552

Please sign in to comment.