Skip to content

Commit 931be04

Browse files
committed
🚧 feat(wip): document conversion jobs via object store and kv claims on items in os
1 parent b03e2ae commit 931be04

File tree

5 files changed

+198
-18
lines changed

5 files changed

+198
-18
lines changed

examples/2_object_create_jobs.ipynb

+86
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,86 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": null,
6+
"metadata": {},
7+
"outputs": [],
8+
"source": [
9+
"from magnet.base import Magnet\n",
10+
"\n",
11+
"config = {\n",
12+
" \"host\": \"localhost\",\n",
13+
" \"credentials\": None,\n",
14+
" \"domain\": None,\n",
15+
" \"stream_name\": \"my_stream\",\n",
16+
" \"category\": \"my_category\",\n",
17+
" \"kv_name\": \"my_kv\",\n",
18+
" \"session\": \"my_session\",\n",
19+
" \"os_name\": \"my_object_store\",\n",
20+
" \"index\": {\n",
21+
" \"milvus_uri\": \"127.0.0.1\",\n",
22+
" \"milvus_port\": 19530,\n",
23+
" \"milvus_user\": \"test\",\n",
24+
" \"milvus_password\": \"test\",\n",
25+
" \"dimension\": 1024,\n",
26+
" \"model\": \"BAAI/bge-large-en-v1.5\",\n",
27+
" \"name\": \"test\",\n",
28+
" \"options\": {\n",
29+
" 'metric_type': 'COSINE',\n",
30+
" 'index_type':'HNSW',\n",
31+
" 'params': {\n",
32+
" \"efConstruction\": 40\n",
33+
" , \"M\": 48\n",
34+
" }\n",
35+
" }\n",
36+
" }\n",
37+
"}\n",
38+
"\n",
39+
"magnet = Magnet(config)\n",
40+
"await magnet.align()"
41+
]
42+
},
43+
{
44+
"cell_type": "code",
45+
"execution_count": null,
46+
"metadata": {},
47+
"outputs": [],
48+
"source": [
49+
"from magnet.ic.field import Charge\n",
50+
"\n",
51+
"field = Charge(magnet)\n",
52+
"await field.on()"
53+
]
54+
},
55+
{
56+
"cell_type": "code",
57+
"execution_count": null,
58+
"metadata": {},
59+
"outputs": [],
60+
"source": [
61+
"# simple data pipelines with predictable outcomes\n",
62+
"from magnet.utils.data_classes import FilePayload\n",
63+
"import base64\n",
64+
"import os\n",
65+
"\n",
66+
"for file in os.listdir('./FinPDF/'):\n",
67+
" if file.endswith('.pdf'):\n",
68+
" with open(f\"./FinPDF/{file}\", \"rb\") as fh:\n",
69+
" content = fh.read()\n",
70+
" encoded_content = base64.b64encode(content).decode('utf-8')\n",
71+
" await field.pulse(\n",
72+
" FilePayload(encoded_content, file.split('/')[-1])\n",
73+
" , create_job=True\n",
74+
" , v=True\n",
75+
" )"
76+
]
77+
}
78+
],
79+
"metadata": {
80+
"language_info": {
81+
"name": "python"
82+
}
83+
},
84+
"nbformat": 4,
85+
"nbformat_minor": 2
86+
}

examples/2_object_fulfill_jobs.ipynb

+85
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,85 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": null,
6+
"metadata": {},
7+
"outputs": [],
8+
"source": [
9+
"from magnet.base import Magnet\n",
10+
"\n",
11+
"config = {\n",
12+
" \"host\": \"localhost\",\n",
13+
" \"credentials\": None,\n",
14+
" \"domain\": None,\n",
15+
" \"stream_name\": \"my_stream\",\n",
16+
" \"category\": \"my_category\",\n",
17+
" \"kv_name\": \"my_kv\",\n",
18+
" \"session\": \"my_session\",\n",
19+
" \"os_name\": \"my_object_store\",\n",
20+
" \"index\": {\n",
21+
" \"milvus_uri\": \"127.0.0.1\",\n",
22+
" \"milvus_port\": 19530,\n",
23+
" \"milvus_user\": \"test\",\n",
24+
" \"milvus_password\": \"test\",\n",
25+
" \"dimension\": 1024,\n",
26+
" \"model\": \"BAAI/bge-large-en-v1.5\",\n",
27+
" \"name\": \"test\",\n",
28+
" \"options\": {\n",
29+
" 'metric_type': 'COSINE',\n",
30+
" 'index_type':'HNSW',\n",
31+
" 'params': {\n",
32+
" \"efConstruction\": 40\n",
33+
" , \"M\": 48\n",
34+
" }\n",
35+
" }\n",
36+
" }\n",
37+
"}\n",
38+
"\n",
39+
"magnet = Magnet(config)\n",
40+
"await magnet.align()"
41+
]
42+
},
43+
{
44+
"cell_type": "code",
45+
"execution_count": null,
46+
"metadata": {},
47+
"outputs": [],
48+
"source": [
49+
"from magnet.ic.field import Resonator, Charge\n",
50+
"from magnet.utils.data_classes import Payload\n",
51+
"\n",
52+
"from tractor_beam.utils.file_handlers import PDFProcessor\n",
53+
"\n",
54+
"import json\n",
55+
"\n",
56+
"reso, field = Resonator(magnet), Charge(magnet)\n",
57+
"processor = PDFProcessor()\n",
58+
"models = processor.load_models()\n",
59+
"\n",
60+
"await reso.on(obj=True)\n",
61+
"\n",
62+
"async def handle_jobs(payload, msg):\n",
63+
" msg['_isClaimed'] = True\n",
64+
" await magnet.kv.update(msg['_id'], json.dumps(msg).encode('utf-8'), payload.revision)\n",
65+
" objects = await magnet.os.list()\n",
66+
" for obj in objects:\n",
67+
" if obj.name == msg['_id']:\n",
68+
" await reso.download(obj)\n",
69+
" file = await processor.export_to_markdown(f\"./{obj.name}.{obj.headers['ext']}\", f\"./\", obj.name, models)\n",
70+
" with open(f\"{file}/{file.split('/')[-1]}.md\", \"r\") as md:\n",
71+
" content = md.read()\n",
72+
" await field.pulse(Payload(content, file.split('/')[-1]), v=True)\n",
73+
"\n",
74+
"worker = await reso.worker(cb=handle_jobs)"
75+
]
76+
}
77+
],
78+
"metadata": {
79+
"language_info": {
80+
"name": "python"
81+
}
82+
},
83+
"nbformat": 4,
84+
"nbformat_minor": 2
85+
}

magnet/base.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -44,7 +44,7 @@ def __init__(self, config: MagnetConfig | dict = None):
4444
config = MagnetConfig(**config)
4545
if isinstance(config.index, dict):
4646
config.index = IndexConfig(**config.index)
47-
elif not isinstance(config, MagnetConfig):
47+
else:
4848
_f("fatal", "config must be a MagnetConfig instance or a dictionary")
4949
raise ValueError
5050
except Exception as e:

magnet/ic/field.py

+20-17
Original file line number | Diff line number | Diff line change
@@ -88,7 +88,7 @@ async def off(self):
8888
await self.magnet.nc.close()
8989
_f('warn', f'disconnected from {self.magnet.config.host}')
9090

91-
async def pulse(self, payload: Payload | FilePayload | GeneratedPayload | EmbeddingPayload | JobParams = None, v=False):
91+
async def pulse(self, payload: Payload | FilePayload | GeneratedPayload | EmbeddingPayload | JobParams = None, create_job=False, v=False):
9292
"""
9393
Publishes data to the NATS server using the specified category and payload.
9494
@@ -107,6 +107,11 @@ async def pulse(self, payload: Payload | FilePayload | GeneratedPayload | Embedd
107107
bucket = await self.magnet.js.object_store(bucket_name)
108108
await bucket.put(object_name, payload_data_bytes, meta=meta)
109109
_f('success', f'uploaded to NATS object store in bucket {bucket_name} as {object_name}') if v else None
110+
111+
if create_job:
112+
job = Job("process_document", _hash)
113+
await self.magnet.kv.put(key=job._id, value=json.dumps(asdict(job)).encode('utf-8'))
114+
_f('info', f'created job {job._id}')
110115
else:
111116
try:
112117
bytes_ = json.dumps(asdict(payload), separators=(
@@ -210,9 +215,7 @@ async def on(self, job: bool = None, local: bool = False, bandwidth: int = 1000,
210215
_f('wait', f'connecting to {self.magnet.config.host}')
211216
try:
212217
if obj:
213-
object_store = await self.magnet.js.object_store(self.magnet.config.os_name)
214-
self.object_store = object_store
215-
self.sub = await object_store.watch(include_history=False)
218+
self.sub = await self.magnet.os.watch(include_history=False)
216219
_f('info',
217220
f'subscribed to object store: {self.magnet.config.os_name} as {self.node}')
218221
else:
@@ -228,9 +231,9 @@ async def on(self, job: bool = None, local: bool = False, bandwidth: int = 1000,
228231
return _f('fatal', e)
229232

230233
async def download(self, obj: object = None):
231-
if obj and self.object_store:
234+
if obj and self.magnet.os:
232235
buffer = io.BytesIO()
233-
file = await self.object_store.get(obj.name, buffer)
236+
file = await self.magnet.os.get(obj.name, buffer)
234237
buffer.seek(0)
235238
chunk_size = 128 * 1024
236239
with open(f"{file.info.name}.{file.info.headers['ext']}", 'wb') as fh:
@@ -263,10 +266,10 @@ async def deliver_messages(msgs):
263266
try:
264267
if type(self.sub).__name__ == "ObjectWatcher":
265268
_f("info", f'consuming objects from [{self.magnet.config.host.split("@")[1]}] from\n🛰️ bucket: {self.magnet.config.os_name}"')
266-
msgs = await self.object_store.list()
269+
msgs = await self.magnet.os.list()
267270
for msg in msgs:
268271
await self.download(msg)
269-
await cb(self.object_store, msg)
272+
await cb(self.magnet.os, msg)
270273
else:
271274
_f("info", f'consuming {job_n} from [{self.magnet.config.category}] on\n🛰️ stream: {self.magnet.config.stream_name}\n🧲 session: "{self.magnet.session}"')
272275
msgs = await self.sub.fetch(batch=job_n, timeout=60)
@@ -280,7 +283,7 @@ async def deliver_messages(msgs):
280283
_f("info", f'consuming objects from [{self.magnet.config.host.split("@")[1]}] from\n🛰️ bucket: {self.magnet.config.stream_name}"')
281284
e = await self.sub.updates()
282285
loop = asyncio.get_event_loop()
283-
loop.create_task(cb(self.object_store, e))
286+
loop.create_task(cb(self.magnet.os, e))
284287
await asyncio.sleep(1)
285288
else:
286289
while True:
@@ -317,16 +320,16 @@ async def worker(self, cb=print):
317320
Exception: If there is an error in consuming the message or processing the callback function.
318321
"""
319322
_f("info",
320-
f'processing jobs from [{self.magnet.config.category}] on\n🛰️ stream: {self.magnet.config.stream_name}\n🧲 session: "{self.magnet.session}"')
323+
f'processing jobs from [{self.magnet.config.kv_name}] on\n🛰️ object store: {self.magnet.config.os_name}')
321324
try:
322-
msg = await self.sub.next_msg(timeout=60)
323-
payload = JobParams(**json.loads(msg.data))
324-
try:
325-
await cb(payload, msg)
326-
except Exception as e:
327-
_f("warn", f'something wrong in your callback function!\n{e}')
325+
keys = await self.magnet.kv.keys()
326+
for key in keys:
327+
_job = await self.magnet.kv.get(key)
328+
job = json.loads(_job.value.decode('utf-8'))
329+
if not job['_isClaimed']:
330+
await cb(_job, job)
328331
except Exception as e:
329-
_f('fatal', 'invalid JSON')
332+
_f('fatal', f'invalid JSON\n{e}')
330333

331334
async def conduct(self, cb=print):
332335
pass

magnet/utils/data_classes.py

+6
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,12 @@
44
from dataclasses import dataclass, field
55
from typing import List, Optional, Callable
66

7+
@dataclass
class Job:
    """Record describing a unit of work that a worker can claim.

    The field names keep their leading underscore and camelCase spelling
    because they become the JSON keys when the record is serialized with
    ``dataclasses.asdict``.
    """

    # Kind of work requested, e.g. "process_document".
    _type: str
    # Identifier of the job; also used as its key in the KV store.
    _id: str
    # Set to True once a worker has taken the job; unclaimed by default.
    _isClaimed: bool = field(default=False)
12+
713
@dataclass
814
class AskParameters:
915
m: str = "mistralai/Mistral-7B-Instruct-v0.1"

0 commit comments

Comments
 (0)