-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy patha-note-on-data-mining.html
105 lines (87 loc) · 20.4 KB
/
a-note-on-data-mining.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
<!DOCTYPE html>
<html lang="en"><head>
<title>A note on data mining</title>
<base href="./">
<meta id="root-path" root-path="./">
<link rel="icon" sizes="96x96" href="https://publish-01.obsidian.md/access/f786db9fac45774fa4f0d8112e232d67/favicon-96x96.png">
<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes, minimum-scale=1.0, maximum-scale=5.0">
<meta charset="UTF-8">
<script src="https://code.iconify.design/iconify-icon/1.0.3/iconify-icon.min.js"></script>
<link rel="stylesheet" href="lib/styles/obsidian-styles.css">
<link rel="stylesheet" href="lib/styles/theme.css">
<link rel="stylesheet" href="lib/styles/plugin-styles.css">
<link rel="stylesheet" href="lib/styles/snippets.css">
<style> </style>
<!-- Graph View Data -->
<script>
// Graph data for the "Interactive Graph" sidebar. These are plain script-level
// bindings; presumably read by the graph view scripts loaded after this block
// (lib/scripts/graph_view.js) — confirm against that file.
//
// `radii`, `labels` and `paths` each hold `nodeCount` entries, one per node.
// `linkSources` and `linkTargets` each hold `linkCount` entries; the values look
// like node indices, pairing up as linkSources[i] -> linkTargets[i].
//
// NOTE: the object literal must keep every string on a single line. A raw line
// break inside a double-quoted string is a syntax error that prevents this whole
// script from running, so the "Highlighting structure" label is kept unbroken.
let nodes = {"nodeCount":34,"linkCount":41,"radii":[7,7,7,6.595041322314049,6.595041322314049,6.2541322314049586,5.809917355371901,5.809917355371901,5.809917355371901,5.2623966942148765,5.2623966942148765,5.2623966942148765,4.6115702479338845,4.6115702479338845,4.6115702479338845,3.857438016528926,3.857438016528926,3.857438016528926,3.857438016528926,3.857438016528926,3.857438016528926,3.857438016528926,3,3,3,3,3,3,3,3,3,3,3,3],"labels":["Day 2","Day 1","Datasets","Day 5","Day 3","Syllabus","Mapping Homelessness in San Diego","Example 1","Day 4","Analyzing Places Data","San Diego Regional Data Library data example","National Survey of Child Health","In-school suspensions and academic achievement","Fatal police shootings","Add Health","A note on data mining","Geographic differences in discharge dispositions","Housing and Transportation Affordability Index","Household Pulse Survey Data Tables","Final web scrap medium","Consumer Complaints","Add-on Packages & Modules","Research Reminders","Note taking","Lit Reviews","Daily Note","Example Template","Research Notes","research notes","Research Paper","Research Ideas","focus_group","Cheat Sheet","Highlighting structure"],"paths":["day-2.html","day-1.html","datasets.html","day-5.html","day-3.html","syllabus.html","mapping-homelessness-in-san-diego.html","example-1.html","day-4.html","analyzing-places-data.html","san-diego-regional-data-library-data-example.html","national-survey-of-child-health.html","in-school-suspensions-and-academic-achievement.html","fatal-police-shootings.html","add-health.html","a-note-on-data-mining.html","geographic-differences-in-discharge-dispositions.html","housing-and-transportation-affordability-index.html","household-pulse-survey-data-tables.html","final-web-scrap-medium.html","consumer-complaints.html","add-on-packages-&-modules.html","00-meta/003-tools/research-reminders.html","00-meta/003-tools/note-taking.html","00-meta/003-tools/lit-reviews.html","00-meta/002-templates/daily-note.html","00-meta/002-templates/example-template.html","00-meta/002-templates/research-notes.html","00-meta/002-templates/meta/zdc-templates/research-notes.html","00-meta/002-templates/meta/zdc-templates/research-paper.html","00-meta/002-templates/meta/zdc-templates/research-ideas.html","00-meta/002-templates/meta/zdc-templates/focus_group.html","00-meta/001-structure/cheat-sheet.html","00-meta/001-structure/highlighting-structure.html"],"linkSources":[2,2,2,2,2,2,2,2,2,2,2,3,3,3,3,7,7,10,5,5,5,5,5,1,1,1,1,1,0,0,0,4,4,4,4,8,6,6,6,12,9],"linkTargets":[0,3,14,20,13,17,9,11,18,15,10,14,1,0,4,8,16,8,1,0,4,8,3,21,19,7,2,12,9,7,11,0,1,11,13,6,10,0,1,0,2]};

// Graph tuning values. Their exact meaning is defined by whatever script reads
// them (not visible here) — presumably force-layout parameters; confirm there.
let attractionForce = 1;
let linkLength = 10;
let repulsionForce = 150;
let centralForce = 3;
let edgePruning = 100;
</script>
<script type="module" src="lib/scripts/graph_view.js"></script>
<script src="lib/scripts/graph_wasm.js"></script>
<script src="lib/scripts/tinycolor.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/pixi.js/7.2.4/pixi.min.js" integrity="sha512-Ch/O6kL8BqUwAfCF7Ie5SX1Hin+BJgYH4pNjRqXdTEqMsis1TUYg+j6nnI9uduPjGaj7DN4UKCZgpvoExt6dkw==" crossorigin="anonymous" referrerpolicy="no-referrer"></script>
<script src="lib/scripts/webpage.js"></script>
</head><body class="theme-dark mod-windows is-frameless is-hidden-frameless obsidian-app show-inline-title show-view-header outliner-plugin-better-lists outliner-plugin-dnd" style="--zoom-factor:1.2; --font-text-size:16px; --line-width:50em; --line-width-adaptive:50em; --file-line-width:50em; --content-width:500em; --sidebar-width:25em; --collapse-arrow-size:0.4em; --tree-horizontal-spacing:1em; --tree-vertical-spacing:0.5em; --sidebar-margin:12px;"><div class="webpage-container"><div class="sidebar-left sidebar"><div class="sidebar-content"><div><label class="theme-toggle-container" for="theme_toggle"><input class="theme-toggle-input" type="checkbox" id="theme_toggle"><div class="toggle-background"></div></label></div><div class="tree-container file-tree mod-nav-indicator" data-depth="0"><div class="tree-header"><span class="sidebar-section-header">Bootcamp</span><button class="clickable-icon collapse-tree-button is-collapsed"><iconify-icon icon="ph:arrows-in-line-horizontal-bold" width="18px" height="18px" rotate="90deg" color="currentColor"></iconify-icon></button></div><div class="tree-scroll-area"></div></div></div></div><div class="document-container"><div class="markdown-preview-view markdown-rendered node-insert-event allow-fold-headings show-indentation-guide allow-fold-lists" tabindex="-1" style="tab-size: 4;"><style id="MJX-CHTML-styles"></style><div class="markdown-preview-sizer markdown-preview-section" style="padding-bottom: ; padding-top: var(--file-margins); padding-right: var(--file-margins); padding-left: var(--file-margins); width: 100%; position: absolute;"><div class="markdown-preview-pusher" style="width: 1px; height: 0.1px; margin-bottom: 0px;"></div><div class="mod-header"><div class="inline-title" contenteditable="true" spellcheck="true" autocapitalize="on" tabindex="-1" enterkeyhint="done">A note on data mining</div></div><div><h1 data-heading="Introduction" id="Introduction">Introduction</h1></div><div><h4 data-heading="Let's go back to 
the crime data from the city of Los Angeles" id="Let's_go_back_to_the_crime_data_from_the_city_of_Los_Angeles" style="display: flex;"><div class="heading-collapse-indicator collapse-indicator collapse-icon"><svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="svg-icon right-triangle"><path d="M3 8L12 17L21 8"></path></svg></div>Let's go back to the crime data from the city of Los Angeles</h4></div><div><ul class="has-list-bullet">
<li data-line="0"><div class="list-bullet"></div>You notice that the data contains a lot of fields, and it is not obvious what they represent</li>
<li data-line="1"><div class="list-bullet"></div>I used the dataset for this paper <a data-tooltip-position="top" aria-label="https://www.sciencedirect.com/science/article/pii/S0145213420303951?casa_token=xhGbUAayRJ0AAAAA:7xMCEa5uv6-CF_yt-XK8rTqMXJf30BPyBHs5ZvrajGngvdyLDU-9ip3GBB5ItPnRaUE6SiHLdDqy" rel="noopener" class="external-link" href="https://www.sciencedirect.com/science/article/pii/S0145213420303951?casa_token=xhGbUAayRJ0AAAAA:7xMCEa5uv6-CF_yt-XK8rTqMXJf30BPyBHs5ZvrajGngvdyLDU-9ip3GBB5ItPnRaUE6SiHLdDqy" target="_blank">A spatiotemporal analysis of the impact of COVID-19 on child abuse and neglect in the city of Los Angeles, California - ScienceDirect</a></li>
<li data-line="2"><div class="list-bullet"></div>Let's take a look at the code book<br>
- There is an interesting field called mo which in the law stands for modus operandi<br>
<span alt="Pasted image 20230806170641.png" src="Pasted image 20230806170641.png" class="internal-embed media-embed image-embed is-loaded"><img alt="Pasted image 20230806170641.png" src="pasted-image-20230806170641.png"></span><br>
- Notice that this field possibly has a ton of data that is all but ignored because people don't pay careful attention<br>
- Below I show you how I used R to create 10 additional fields of data from the mo field</li>
</ul></div><div><pre class="language-r" tabindex="0"><code class="language-r is-loaded">library<span class="token punctuation">(</span>RSocrata<span class="token punctuation">)</span>
base_url <span class="token operator">=</span> <span class="token string">"https://data.lacity.org/resource/2nrs-mtv8.json?"</span> <span class="token comment">#this is the dataset 2020 to present</span>
my_token <span class="token operator"><-</span> <span class="token string">"w0BkWUPZYzjQRwNEVX8KEijw4"</span>
lacity_data <span class="token operator"><-</span> read.socrata<span class="token punctuation">(</span>base_url<span class="token punctuation">,</span> my_token<span class="token punctuation">)</span>
glimpse<span class="token punctuation">(</span>lacity_data<span class="token punctuation">)</span>
ipv_crimes_in_la <span class="token operator"><-</span> lacity_data <span class="token percent-operator operator">%>%</span>
mutate<span class="token punctuation">(</span>vict_age <span class="token operator">=</span> as.numeric<span class="token punctuation">(</span>vict_age<span class="token punctuation">)</span><span class="token punctuation">)</span> <span class="token percent-operator operator">%>%</span>
filter<span class="token punctuation">(</span>str_detect<span class="token punctuation">(</span>crm_cd_desc<span class="token punctuation">,</span> <span class="token string">"INTIMATE PARTNER"</span><span class="token punctuation">)</span><span class="token punctuation">,</span>
str_detect<span class="token punctuation">(</span>premis_desc<span class="token punctuation">,</span> <span class="token string">"MOTORHOME|GROUP HOME|MOTEL|DWELLING|RESIDENTIAL|HOUSING"</span><span class="token punctuation">)</span><span class="token punctuation">)</span> <span class="token percent-operator operator">%>%</span>
mutate<span class="token punctuation">(</span>crm_cd_desc <span class="token operator">=</span> str_replace<span class="token punctuation">(</span>crm_cd_desc<span class="token punctuation">,</span> <span class="token string">".*INTIMATE PARTNER.*"</span><span class="token punctuation">,</span> <span class="token string">"IPV"</span><span class="token punctuation">)</span><span class="token punctuation">)</span><span class="token percent-operator operator">%>%</span>
select<span class="token punctuation">(</span><span class="token string">"dr_no"</span><span class="token punctuation">,</span> <span class="token string">"crm_cd_desc"</span><span class="token punctuation">,</span> <span class="token string">"date_occ"</span><span class="token punctuation">,</span> <span class="token string">"time_occ"</span><span class="token punctuation">,</span> <span class="token string">"lat"</span><span class="token punctuation">,</span> <span class="token string">"lon"</span><span class="token punctuation">,</span> <span class="token string">"mocodes"</span><span class="token punctuation">,</span><span class="token string">"area_name"</span><span class="token punctuation">,</span> <span class="token string">"vict_age"</span><span class="token punctuation">,</span> <span class="token string">"vict_sex"</span><span class="token punctuation">,</span> <span class="token string">"vict_descent"</span><span class="token punctuation">,</span> <span class="token string">"premis_desc"</span><span class="token punctuation">,</span> <span class="token string">"weapon_desc"</span><span class="token punctuation">,</span> <span class="token string">"status_desc"</span><span class="token punctuation">)</span>
glimpse<span class="token punctuation">(</span>ipv_crimes_in_la<span class="token punctuation">)</span>
ipv_crimes_in_la<span class="token operator">$</span>mo <span class="token operator"><-</span> ipv_crimes_in_la<span class="token operator">$</span>mocodes
ipv_crime_mo <span class="token operator"><-</span> separate<span class="token punctuation">(</span>data <span class="token operator">=</span> ipv_crimes_in_la<span class="token punctuation">,</span> col <span class="token operator">=</span> mocodes<span class="token punctuation">,</span> into <span class="token operator">=</span>
c<span class="token punctuation">(</span><span class="token string">"m1"</span><span class="token punctuation">,</span> <span class="token string">"m2"</span><span class="token punctuation">,</span> <span class="token string">"m3"</span><span class="token punctuation">,</span> <span class="token string">"m4"</span><span class="token punctuation">,</span> <span class="token string">"m5"</span><span class="token punctuation">,</span> <span class="token string">"m6"</span><span class="token punctuation">,</span> <span class="token string">"m7"</span><span class="token punctuation">,</span> <span class="token string">"m8"</span><span class="token punctuation">,</span> <span class="token string">"m9"</span><span class="token punctuation">,</span> <span class="token string">"m10"</span><span class="token punctuation">)</span><span class="token punctuation">,</span>
sep <span class="token operator">=</span> <span class="token string">" "</span><span class="token punctuation">)</span>
<span class="token comment">#makes all "m" variables numeric at once</span>
ipv_crime_mo<span class="token punctuation">[</span><span class="token punctuation">,</span><span class="token number">7</span><span class="token operator">:</span><span class="token number">16</span><span class="token punctuation">]</span> <span class="token operator"><-</span> sapply<span class="token punctuation">(</span>ipv_crime_mo<span class="token punctuation">[</span><span class="token punctuation">,</span><span class="token number">7</span><span class="token operator">:</span><span class="token number">16</span><span class="token punctuation">]</span><span class="token punctuation">,</span>as.numeric<span class="token punctuation">)</span>
glimpse<span class="token punctuation">(</span>ipv_crime_mo<span class="token punctuation">)</span>
tbl_lookup<span class="token operator"><-</span>read.csv<span class="token punctuation">(</span><span class="token string">"C:/Users/barboza-salerno.1/OneDrive - The Ohio State University/Desktop/Research/LA County/MO_CODES_Numerical_20180627.csv"</span><span class="token punctuation">)</span>
names<span class="token punctuation">(</span>tbl_lookup<span class="token punctuation">)</span><span class="token punctuation">[</span><span class="token number">1</span><span class="token punctuation">]</span> <span class="token operator"><-</span> <span class="token string">"id"</span>
<span class="token keyword">for</span> <span class="token punctuation">(</span>i <span class="token keyword">in</span> <span class="token number">1</span><span class="token operator">:</span><span class="token number">10</span><span class="token punctuation">)</span><span class="token punctuation">{</span>
ipv_crime_mo<span class="token punctuation">[</span><span class="token punctuation">,</span><span class="token punctuation">(</span>length<span class="token punctuation">(</span>ipv_crime_mo<span class="token punctuation">)</span><span class="token operator">+</span><span class="token number">1</span><span class="token punctuation">)</span><span class="token punctuation">]</span> <span class="token operator">=</span> tbl_lookup<span class="token punctuation">[</span>match<span class="token punctuation">(</span>ipv_crime_mo<span class="token punctuation">[</span><span class="token punctuation">,</span><span class="token punctuation">(</span>i<span class="token operator">+</span><span class="token number">6</span><span class="token punctuation">)</span><span class="token punctuation">]</span><span class="token punctuation">,</span> tbl_lookup<span class="token operator">$</span>id<span class="token punctuation">)</span><span class="token punctuation">,</span> <span class="token string">"descript"</span><span class="token punctuation">]</span>
<span class="token punctuation">}</span>
library<span class="token punctuation">(</span>writexl<span class="token punctuation">)</span>
write_xlsx<span class="token punctuation">(</span>ipv_crime_mo<span class="token punctuation">,</span> <span class="token string">"ipv_crime_mo.xlsx"</span><span class="token punctuation">)</span>
</code><button class="copy-code-button">Copy</button></pre></div><div><ul class="has-list-bullet">
<li data-line="0"><div class="list-bullet"></div>
<p>First, we separate the mo column into multiple columns<br>
<span alt="Pasted image 20230806170317.png" src="Pasted image 20230806170317.png" class="internal-embed media-embed image-embed is-loaded"><img alt="Pasted image 20230806170317.png" src="pasted-image-20230806170317.png"></span></p>
</li>
<li data-line="3"><div class="list-bullet"></div>
<p>Then I create a lookup table based on this file of <a data-tooltip-position="top" aria-label="https://recordsrequest.lacity.org/documents/752789" rel="noopener" class="external-link" href="https://recordsrequest.lacity.org/documents/752789" target="_blank">mo codes and descriptions</a></p>
</li>
<li data-line="4"><div class="list-bullet"></div>
<p>Then I replaced the codes with the text in the data<br>
<span alt="Pasted image 20230806170004.png" src="Pasted image 20230806170004.png" class="internal-embed media-embed image-embed is-loaded"><img alt="Pasted image 20230806170004.png" src="pasted-image-20230806170004.png"></span></p>
</li>
<li data-line="7"><div class="list-bullet"></div>
<p>Now I can filter the data for crimes involving any number of modus operandi, for example, the code below filters all IPV crimes that involved "homosexuals"</p>
</li>
</ul></div><div><pre class="language-r" tabindex="0"><code class="language-r is-loaded">homosexual_data <span class="token operator"><-</span> ipv_crime_mo <span class="token percent-operator operator">%>%</span>
filter_at<span class="token punctuation">(</span>.vars <span class="token operator">=</span> vars<span class="token punctuation">(</span>V25<span class="token punctuation">,</span> V26<span class="token punctuation">,</span> V27<span class="token punctuation">,</span> V28<span class="token punctuation">,</span> V29<span class="token punctuation">,</span> V30<span class="token punctuation">,</span> V31<span class="token punctuation">,</span> V32<span class="token punctuation">,</span> V33<span class="token punctuation">,</span> V34<span class="token punctuation">)</span><span class="token punctuation">,</span>
.vars_predicate <span class="token operator">=</span> any_vars<span class="token punctuation">(</span>str_detect<span class="token punctuation">(</span>. <span class="token punctuation">,</span> <span class="token string">"Homo"</span><span class="token punctuation">)</span><span class="token punctuation">)</span><span class="token punctuation">)</span>
</code><button class="copy-code-button">Copy</button></pre></div><div class="mod-footer"><div class="embedded-backlinks" style="display: none;"></div></div></div></div></div><div class="sidebar-right sidebar"><div class="sidebar-content"><span class="sidebar-section-header">Interactive Graph</span><div class="graph-view-placeholder">
<div class="graph-view-container">
<div class="graph-icon graph-expand" role="button" aria-label="Expand" data-tooltip-position="top"><svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="svg-icon lucide-arrow-up-right"><line x1="7" y1="17" x2="17" y2="7"></line><polyline points="7 7 17 7 17 17"></polyline></svg></div>
<canvas id="graph-canvas" width="512px" height="512px"></canvas>
</div>
</div><div class="tree-container outline-tree" data-depth="0"><div class="tree-header"><span class="sidebar-section-header">Table Of Contents</span><button class="clickable-icon collapse-tree-button"><iconify-icon icon="ph:arrows-in-line-horizontal-bold" width="18px" height="18px" rotate="90deg" color="currentColor"></iconify-icon></button></div><div class="tree-scroll-area"><div class="tree-item mod-tree-heading" data-depth="1"><div class="tree-item-contents"><a class="tree-item-link" href="#Introduction"><span class="tree-item-title">Introduction</span></a></div><div class="tree-item-children"><div class="tree-item mod-tree-heading" data-depth="4"><div class="tree-item-contents"><a class="tree-item-link" href="#Let's_go_back_to_the_crime_data_from_the_city_of_Los_Angeles"><span class="tree-item-title">Let's go back to the crime data from the city of Los Angeles</span></a></div><div class="tree-item-children"></div></div></div></div></div></div></div></div></div></body></html>