Skip to content

Commit

Permalink
update
Browse files Browse the repository at this point in the history
  • Loading branch information
apmapmapm committed Jul 16, 2024
1 parent b7da8d2 commit c10fcf7
Showing 1 changed file with 71 additions and 3 deletions.
74 changes: 71 additions & 3 deletions notebook_filter_duplicate_muuid.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 1,
"id": "9c599d5c-925e-40e1-aac8-ec59048106b2",
"metadata": {},
"outputs": [],
Expand Down Expand Up @@ -34,7 +34,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 2,
"id": "482d3b50-3ffc-453a-8c45-99af16359f17",
"metadata": {},
"outputs": [],
Expand Down Expand Up @@ -208,7 +208,75 @@
"id": "546514f7-7566-441b-baeb-9dbab3b7d425",
"metadata": {},
"outputs": [],
"source": []
"source": [
"%%time\n",
"# Remove duplicate documents from the collection, one date window at a time,\n",
"# walking backwards from the current time. Within each window, documents that\n",
"# share the same (musername, muuid, musid, con_address, color, servername)\n",
"# are grouped; one document carrying the newest `date` is kept and the other\n",
"# documents of the group are deleted.\n",
"collection = muuid\n",
"\n",
"WINDOW_DAYS = 40  # width of each window, in days\n",
"NUM_WINDOWS = 4   # number of windows to walk back (4 * 40 = 160 days in total)\n",
"window = datetime.timedelta(days=WINDOW_DAYS)\n",
"\n",
"# Read the clock once. Calling now() separately for the start and the end of\n",
"# a window makes start + window fall a few microseconds short of end, so the\n",
"# while loop below would run a second, unintended pass over the next window.\n",
"# NOTE(review): now() is naive local time and pymongo treats naive datetimes\n",
"# as UTC -- confirm the stored `date` values follow the same convention.\n",
"now = datetime.datetime.now()\n",
"\n",
"for i in range(NUM_WINDOWS):\n",
"    end_date = now - i * window\n",
"    start_date = end_date - window\n",
"\n",
"    current_date = start_date\n",
"\n",
"    # _ids collected for this window; deleted in one call after the scan\n",
"    would_delete = []\n",
"\n",
"    # The step equals the window width, so with a single clock reading this\n",
"    # loop body runs exactly once per window.\n",
"    while current_date < end_date:\n",
"        next_date = current_date + window\n",
"\n",
"        # Group documents by unique fields excluding _id and date\n",
"        pipeline = [\n",
"            {\n",
"                '$match': {\n",
"                    'date': {'$gte': current_date, '$lt': next_date}\n",
"                }\n",
"            },\n",
"            {\n",
"                '$group': {\n",
"                    '_id': {\n",
"                        'musername': '$musername',\n",
"                        'muuid': '$muuid',\n",
"                        'musid': '$musid',\n",
"                        'con_address': '$con_address',\n",
"                        'color': '$color',\n",
"                        'servername': '$servername'\n",
"                    },\n",
"                    'latest': {'$max': '$date'},\n",
"                    'docs': {'$push': {'_id': '$_id', 'date': '$date'}}\n",
"                }\n",
"            }\n",
"        ]\n",
"\n",
"        # Execute the aggregation pipeline\n",
"        result = collection.aggregate(pipeline)\n",
"\n",
"        # Number of groups (i.e. surviving documents) seen in this pass\n",
"        keep = 0\n",
"        for doc in result:\n",
"            latest_date = doc['latest']\n",
"            docs = doc['docs']\n",
"\n",
"            # Pick the document to keep; when several documents tie on the\n",
"            # newest date, the last one in the pushed list wins.\n",
"            latest_id = None\n",
"            for d in docs:\n",
"                if d['date'] == latest_date:\n",
"                    latest_id = d['_id']\n",
"\n",
"            # Every other document of the group is queued for deletion\n",
"            if latest_id:\n",
"                keep += 1\n",
"                would_delete.extend(d['_id'] for d in docs if d['_id'] != latest_id)\n",
"        print(\"docs\",len(would_delete),\"keep\",keep,next_date)\n",
"\n",
"        current_date = next_date\n",
"\n",
"    print(\"Run complete!\")\n",
"    # Go through the same handle that was aggregated on, not `muuid` directly\n",
"    res = collection.delete_many({'_id': {'$in': would_delete}})\n",
"    print(\"delted\",res.deleted_count)"
]
}
],
"metadata": {
Expand Down

0 comments on commit c10fcf7

Please sign in to comment.