-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathindex.html
804 lines (607 loc) · 44.4 KB
/
index.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
<!DOCTYPE html>
<html lang="en" xml:lang="en">
<head>
<title>Making Data Pipelines in R:</title>
<meta charset="utf-8" />
<meta name="author" content="Meghan Harris, MPH" />
<script src="libs/header-attrs/header-attrs.js"></script>
<script src="libs/freezeframe/freezeframe.min.js"></script>
<script src="libs/xaringanExtra-freezeframe/freezeframe-init.js"></script>
<script id="xaringanExtra-freezeframe-options" type="application/json">{"selector":"img[src$=\"gif\"]","trigger":"click","overlay":false,"responsive":false,"warnings":true}</script>
<link href="libs/xaringanExtra-extra-styles/xaringanExtra-extra-styles.css" rel="stylesheet" />
<link rel="stylesheet" href="scripts/style.css" type="text/css" />
<link rel="stylesheet" href="scripts/animation.css" type="text/css" />
</head>
<body>
<textarea id="source">
class: center, middle, inverse, title-slide
# Making Data Pipelines in R:
## A Story From A ‘Self-Taught’ Perspective
### Meghan Harris, MPH
### RStudio::conf(2022) | Washington D.C, U.S
### 27/07/2022 - July 27th, 2022
---
class: dark, center, middle, animated, fadeIn
<img src= "images/initial_dream.png" alt="An abstract piece of generative art that looks like a tangled ball of black string with various lighter colors mixed in. Some may even say it looks like a messy black hole." width = "100%" height = "100%">
???
So I have a story to tell you all about making data pipelines in R, but before I get into that, I have a question to ask all of you. You don't have to answer me out loud, but...
<br><br><br>
When you see this image... Does it make you feel anything? Any ideas on what it is?
<br><br>
Well, this is my personal, 100% accurate representation of a data pipeline. Now you're probably thinking I'm out of my mind, and I might be, but hear me out. Of course this isn't an actual data pipeline, but rather the mental image and feelings I had in my last position as a Data specialist when I was tasked with creating a data pipeline from scratch.
<br><br>
The story I want to tell you all today is about how I created that pipeline in R within the context of me being a self-taught R programmer in my first professional data position.
<br><br>
Those of us that work in data, when we think of data pipelines, it might look something more like this...
---
class: dark, center, middle, animated, fadeIn
<img src= "images/perfectpipe.png" alt="A visual example of a 'perfect' pipeline. A process that starts with perfect, contained data collection, perfect cleaning processing, perfect cloud data storage, an easy way to pull data into software and ultimately share it with others easily." width = "100%" height = "100%">
???
The "Perfect" Data Pipeline:
Nice clear processes, clean structures, with great results
<br>
<br>
This is a pipeline to most of us, but in this last position I was in, the pipeline looked nothing like that at all. If we could say there was any pipeline, it was very manual, very clunky, no structure, no automation, just vibes and recklessness. So although I learned a lot and have a lot to share, I don't have a lot of time to talk today, so I pulled out four main points that I learned about creating pipelines during this journey.
---
class: light, animated, fadeIn
<h1><span>Main Points</span></h1>
<br><br>
<center><img src= "images/mainpoints.png" alt="A roadmap visual of the presentation flow. It displays four sections of the presentation. The first encounters with the data, the environmental structures of the pipeline, the integration of data validation into the pipeline, and creating and identifying sustainability efforts for the data pipeline." width = "100%"></center>
???
-My first encounters with the data in this pipeline<br>
-The importance of identifying the environmental structures of the data pipeline<br>
-Embedding validations within the data pipeline<br>
-Understanding what sustainability looks like for the data pipeline<br>
-Keep in mind that all of these main points could easily be their own talks so I'll only touch on the biggest takeaways. However, I've created a list of example pipeline scripts and resources that show some of these concepts in action that you'll be able to access and use after this talk.
---
class: light, animated, fadeIn
<h1><span>Main Points</span></h1>
<br><br>
<center><img src= "images/mp_1.png" alt = "The presentation's roadmap with the first main point, first encounters, highlighted." width = "100%"></center>
???
So, with that out of the way, I think the best thing for me to do is go back to the beginning, during those first encounters I had with this pipeline's data within this position.
<br><br>
Now I know, I'm going to sound like I'm out of my mind again, but, the very first thing I learned during my first encounters in this journey was that...
---
class: light, animated, fadeIn
<h1><span>First Encounters</span></h1>
<center><br><br><br>
<p><i><font size = "48px">"The first step to building a data pipeline in R, <font color = "#A21152">is to <b>NOT</b> open R.</font>"</font></i></p>
</center>
???
sounds counter-intuitive...<br>
Let me give you some context..
---
class: light, animated, fadeIn
<h1><span>First Encounters</span></h1>
<img src="images/myself.png" alt="a picture of Meghan Harris on her first day of work as a Data Specialist" width="25%" height="25%" style="float: left; margin-top:100px;">
<p style="float: right;"><h2><b>Some Facts:</b></h2>
<ul><li>Unfamiliar subject matter - opioid data</li><br>
<li>A lot of data existed everywhere</li><br>
<li>Organization did not have any pipelines</li><br>
<li>Organization's expectations weren't clear</li><br>
<li>No direction was given...</li></ul></p>
???
Some facts about how it was starting in this position...
- Not familiar with things<br>
- Data was everywhere<br>
- Organization didn't have pipelines, or expectations<br>
- thus -> No direction given...what's a newbie like myself to do?
---
class: light, animated, fadeIn
<h1><span>First Encounters</span></h1>
<h2><b>So What Did I Do?</b></h2>
<center>
<img class = "gif" src="images/headfirst.gif" alt="A looping animated gif of Shaun from the Shaun of the Dead movie jumping over a fence to get to the Winchester." width = "400px" height = "300px">
</center>
???
Anyone seen Shaun of the Dead?<br>
Just like Shaun dove head first here on his way to the Winchester...I jumped into RStudio head-first
---
class: light, animated, fadeIn
<h1><span>First Encounters</span></h1>
<h3><b>What I did vs. <font color = "#A21152"><i>What I Should Have Done</i></font></b></h3>
<center><br><br><br>
<b>I tried to create R programs immediately.</b>
<br><br>
--
<b><i><font color = "#A21152">I should have started investigating instead.</font></i></b><br><br>
</center>
???
I argue this point with a real-life example of something that happened to me at work<br>
---
class: light, animated, fadeIn
<h1><span>First Encounters</span></h1>
<h2><b>The Narcan Story</b></h2>
<center><img src= "images/narcan.png" alt="A picture of a nasal spray medication called Narcan. Narcan is used to reverse opioid overdoses." height = "200px"><br><br>
--
<b><i><font color = "#A21152">0,1,2,3 are numbers...right?</font><font color = "#5c0e0b"> <br>🚨"4+" is not...🚨</font></i></b><br><br>
</center>
???
Narcan is a medication that reverses opioid overdoses.
<br>
Got a data set with a variable called "Narcan" in it <br>
Opened RStudio IDE, visually looked at the variable <br>
Saw Numbers, so just like the Shaun a couple of slides ago <br>
... I jumped in head first <br>
...and proceeded to create reports and dashboards that included calculations of this Narcan variable...<br>
Two months later, I had a random "Eureka" moment and decided to check the Narcan Variable...<br>
I found a value of "4+" *SLIDE UP* ...which is not a number<br>
Imagine the embarrassment of going back and telling everyone that everything I did for two months was wrong
---
class: light, animated, fadeIn
<h1><span>First Encounters</span></h1>
<img src="images/patches.png" alt="A picture of a black dog laying on a bed and partially covering an open laptop." width="25%" height="25%" style="float: left; margin-top:100px;">
<p style="float: right;"><h3><b>What I learned about First Encounters:</b></h3>
<ul><li>Don't do any "real" coding...yet</li><br>
<li>Ask for code books</li><br>
<li>Understand the data's environment</li><br>
<li>Understand the organization's environment</li><br>
<li>Try to start creating "metadata"</li></ul>
???
So I learned a lot clearly as seen here, but the main thing is to NOT jump into RStudio as soon as you get a dataset.<br>
If you do jump to coding, do the bare minimum to give you clues/info you need to start asking questions about the data and policies/processes in the organization <br> You can also document this early process by making a rough version of metadata. Where you track where data is coming from, what's in the data, who handles the data, etc.
<br>
A hard lesson I learned was that taking the time to take a step back and go through an iterative process of asking questions can give a bit more clarity and focus as you begin to start creating a data pipeline. <br>This is more important than just rushing to open RStudio because if you jump in with no information, head first, that pipeline can look like that first image I showed, but if we add clarity by taking the time to investigate...
---
class: dark, center, middle, animated, fadeIn
<img src= "images/pl_2.png" alt="The second version of the abstract art that represents a data pipeline when data investigations are performed. The image still looks like a tangled mess, but with the hole in the center of it widening a bit." width = "100%" height = "100%">
???
we'll find that things are still messy, this IS a different picture than the first if you couldn't tell. It might not seem like much changed here but that small change can be just enough to help you refine and build processes.<br>
Once you accept the fact that you need to repeatedly do this process it's safe to start building out the foundation of the pipeline...
---
class: light, animated, fadeIn
<h1><span>Main Points</span></h1>
<br><br>
<center><img src= "images/mp_2.png" alt="The presentation's roadmap now with the second main point, Environmental Structures, highlighted." width = "100%"></center>
???
by introducing actual environmental structure to the process ..<br>
The biggest thing I learned about environmental structures is that...
---
class: light, animated, fadeIn
<h1><span>Environmental Structures</span></h1>
<center><br><br><br>
<p><i><font size = "48px">"Data pipelines can exist in two environments that co-exist together; <font color = "#0B82B9"> <b>external</b> and <b>internal</b> environments.</font>"</font></i></p>
</center>
???
probably more sophisticated theories and ways to break this concept down even further, but being that it was my first time building a pipeline, it was more digestible for me to think about this concept as simply stating that a data pipeline has two environments that coexist and build off of each other.
---
class: light, animated, fadeIn
<h1><span>Environmental Structures</span></h1>
<center><img src= "images/internal_ext.png" alt="An image that shows visuals of components of a pipeline's potential internal and external structures, with the external on the left side and internal on the right." height = 450px></center>
???
So when starting out, I realized I had external things to worry about outside of my computer<br><br>and internal things to worry about inside of my computer. To further explain both of these...
---
class: light, animated, fadeIn
<h1><span>Environmental Structures</span></h1>
<img src="images/external.png" alt="A simple diagram that shows a double arrow between two individuals and a double arrow between two buildings" width="25%" height="25%" style="float: left; margin-top:100px;">
<p style="float: right;"><h2><b>External Environment</b></h2>
<ul><li>Data interactions, decisions, and processes that exist <b>externally</b> to your system.</li><br>
<li>Administrative policies that affect the flow of data</li><br>
<li>Technology that is used for data collection outside of R</li><br>
<li>Data literacy of stakeholders and co-workers</li></ul></p><br>
???
In my xp, it was easier to ID the external environment first as this overlapped with the investigation process mentioned earlier. More data sources you have, more involved the process is. Knowing all I could about the external environment, helped guide what the structure would look like internally.
---
class: light, animated, fadeIn
<h1><span>Environmental Structures</span></h1>
<img src="images/internal.png" alt="A picture that shows a simple diagram of data flowing in between folders and into data visualizations." width="25%" height="25%" style="float: left; margin-top:100px;">
<p style="float: right;"><h2><b>Internal Environment</b></h2>
<ul><li>Actual file structure and data flow <b>internally</b> on your system</li><br>
<li>Flow of logic for data processing, analysis, and deliverable creation/deployment</li><br>
<li>Method of data security and storage processes</li></ul></p>
???
The internal environment is what most people think about when they think of data pipeline. The internal environment is essential. Because it deals with...<br><br>
-Understanding the best way to set up your logic in things<br>
-Understanding the nuances of any packages that you are using<br>
-Understanding what R packages are essential for minimum functionality<br>
-Understanding the output needed for deliverables.<br><br>
Now, it should go without saying that this is not the reality...
---
class: light, animated, fadeIn
<h1><span>Environmental Structures</span></h1>
<center><img src= "images/interactions.png" alt="The original picture of the external investigation visuals on the left, and the internal investigation visuals on the right now with the addition of multiple arrows crossing over into each environment on the left and right sides to visually make the point that environments in a data pipeline are seldom siloed or exist on their own." height = 450px></center>
???
These environments don't exist in silos. Clearly external things can affect internal things and vice versa...<br>
This is a challenge that every data person knows well. Even if you don't think about the data's environment in this way, those of you that work in non-profit, academia, or government can probably relate to feeling like something in your work is affected by bureaucracy or conflicting interests/decisions..<br><br>
What helped me to better navigate the interactions between the two environments was to repeatedly tackle each environment by introducing structure, documentation, and workflow processes into one environment and adjusting for the other environment as needed. You can try your best to account for all of these things at the beginning, but it's hard to know "what" will influence "what" every time.
---
class: light, animated, fadeIn
<h1><span>Environmental Structures</span></h1>
<h5><b>Examples of Structuring the External Environment</b></h5>
<center><b>Making Metadata</b>
<br>
<img src="images/metaex.png" alt="A screenshot of a metadata table that comes from a datasets codebook. Metadata can consist of data about the data with information such as file types variable types administrators methods of collection etc" width = "50%" height = "50%"></center>
???
This is a snippet of a codebook's metadata section I've created in the past. The specifics here aren't important and there's example documents like this in the repo you can use for yourself. But the main thing that helped me was having literal data about the data...
---
class: light, animated, fadeIn
<h1><span>Environmental Structures</span></h1>
<h5><b>Examples of Structuring the External Environment</b></h5>
<center><b>"Assess, Attempt, Repeat"</b>
<br>
<img src="images/aar.png" alt="A visual with a series of gears on the left and two people talking on the right. This visualizes the concept of assessing what is or isn't working in a data pipeline externally, attempting to bring improvements, and repeating the process until a satisfactory resolution is achieved." width = "45%" height = "40%"></center>
???
If you're coming into workplaces that already have established ways of doing things, and they are inefficient, as a new person, you have the benefit of seeing things with a fresh eye. You can always attempt to make the changes to make things better, sometimes it can be successful, sometimes it may not. If it's not successful, you can always try again by repeating to solve any problems in a different way. Whether that's changing something externally, or compromising and changing something internally.<br><br>
And Speaking of Internally, I learned a few things that helped me along the way as well.
---
class: light, animated, fadeIn
<h1><span>Environmental Structures</span></h1>
<h5><b>Examples of Structuring the Internal Environment</b></h5>
<center><b>Make Clean File Structures</b><br>
<img src="images/filestructure.png" alt="A screenshot of an RStudio IDE's Files pane that shows a clean file directory. Clean file directories will have established folders for certain file types, data, objects, and components needed for a data pipeline in R" width = "70%"></center>
???
-Always create an R Project file/set up for each deliverable<br><br>
It's always helped me to break files up by type (data/script)<br>
all data has a home, all scripts, and images, etc. has a home.<br>
Can look different for you based on your org situation.
---
class: light, animated, fadeIn
<h1><span>Environmental Structures</span></h1>
<h5><b>Examples of Structuring the Internal Environment</b></h5>
<center><b>Modularize Your Code (By Data/Component)</b><br>
<img src="images/modularizedcode.png" alt="A screenshot of an RStudio IDE's Files pane that shows a clean file directory with example of R script files that were modularized by data set. This can work well in pipelines with many data sources." width = "80%"></center>
???
You can modularize it by component<br>
Naloxtrac - Tracking dashboard that shows aggregate ODs and public Narcan locations<br>
---
class: light, animated, fadeIn
<h1><span>Environmental Structures</span></h1>
<h5><b>Examples of Structuring the Internal Environment</b></h5>
<center><b>Modularize Your Code (By Function/Process)</b><br>
<img src="images/modularizedcode2.png" alt="A screenshot of an RStudio IDE's Files pane that shows a clean file directory with example of R script files that were modularized by function. This can work well in pipelines with only one or few data sources." width = "100%"></center>
???
Can modularize it by function - especially if you're only working with fewer data sources
---
class: light, animated, fadeIn
<h1><span>Environmental Structures</span></h1>
<h5><b>Examples of Structuring the Internal Environment</b></h5>
<center><b>Connect and Chain Where It Makes Sense</b></center>
```r
#==============================#
# Library Load-in====
#==============================#
library(tidyverse) # For everything data
library(googlesheets4) # For interacting with Google Drive
library(janitor) # To keep things clean
#==============================#
```
???
The last thing I learned about the internal environment of pipelines in R is the benefit of something that I've called "connecting and chaining"<br><br>
The context of the next few code snippets aren't important, but the structure of the scripts and how it's organized and used is.<br>
The following snippets are a part of an example "docking script" - one R script that controls the execution of an entire pipeline in one script by using logic to launch and execute other scripts. - We start with a simple library load-in...
---
class: light, animated, fadeIn
<h1><span>Environmental Structures</span></h1>
<h5><b>Examples of Structuring the Internal Environment</b></h5>
```r
#==============================#
# Data Process Checks====
#==============================#
data_updated <- FALSE
#Data Load in#
Mainspreadsheet <- "Google Link to Spreadsheet HERE"
invisible(readline(prompt=c(gs4_auth(), "Press ENTER to confirm selection")))
#Spreadsheet being pulled into the environment#
new_data <- read_sheet(Mainspreadsheet, sheet = "Version 1" ) %>%
select(c(2:4,6,7))
#Pulling in "Master" file to scan for any changes#
master_data <- readRDS("data/master_data.rds")
#Data "Validation"#
data_updated <- nrow(new_data) != nrow(master_data)
```
???
Then we move into loading in data and doing work to check, or validate, the data that comes in. The data can come from many different sources of course, this example is pulling from a Google Sheet with the googlesheets4 package.
---
class: light, animated, fadeIn
<h1><span>Environmental Structures</span></h1>
<h5><b>Examples of Structuring the Internal Environment</b></h5>
```r
#==============================#
#Logic Flow for Processing====
#==============================#
# If data has changed, a separate processing program is launched.#
# And a new master file is saved once it completes.#
if(data_updated){
source("scripts/processing_script.R")
}
# Pipeline returns to the dock and launches a new script to make visuals
# Only if the data was updated#
#Visual Creation#
if(data_updated){
source("scripts/viz_script.R")
}
```
???
Next we have an example of logic control flows. This section uses logic variables from the previous section to determine if the pipeline should launch additional scripts.<br><br>
In this specific example, ONLY if the data was updated it will go ahead and launch additional data processing and visualization scripts. If the data wasn't updated, these additional scripts are not launched, thus saving some computer power and time.
---
class: light, animated, fadeIn
<h1><span>Environmental Structures</span></h1>
<h5><b>Examples of Structuring the Internal Environment</b></h5>
```r
#==============================#
#Rendering of Deliverable====
#==============================#
# Pipeline returns to the dock and renders the associated R Markdown report,
# writing the result to "example_report.html".
rmarkdown::render("reports/example_report.Rmd",
output_file = "example_report.html")
```
???
The last part of the docking script is the rendering, or execution of the final output. In this example, the final deliverable is an R Markdown document. This can be changed to the deployment of a shiny app, or even just saving data out of R for use in another program like Power BI or Tableau.<br><br>
So now that we've added some environmental structures, our pipeline looks like this....
---
class: dark, center, middle, animated, fadeIn
<img src= "images/pl_3.png" alt="The third version of the abstract art that represents a data pipeline when data investigations and environmental structures are added. The image now looks a bit cleaner and the tangled mess appears to start straightened out into some type of pattern. Some aspects of the image are still messy with the hole in the center of it widening even more." width = "100%" height = "100%">
???
Which takes us to the third main point...
---
class: light, animated, fadeIn
<h1><span>Main Points</span></h1>
<br><br>
<center><img src= "images/mp_3.png" alt="The presentation's roadmap now with the third main point, Data Validations, highlighted." width = "100%"></center>
???
Data validation....<br> So, I feel like I'm exposing myself here, but the biggest thing I learned about validations was.... to do them<br>
---
class: light, animated, fadeIn
<h1><span>Validations</span></h1>
<center><br><br><br>
<p><font size = "48px"><i><font color = "#612884"><b>"Check</b> yourself</font>, before you wreck yourself"</i><br>😭</font></p>
</center>
???
Because I WAS NOT checking myself, and as a result, I was indeed WRECKING myself. <br>
I still have a lot to learn myself about performing data validations efficiently. So I'm not going to try and act like I'm an expert, but I can at least share the little bit that I did learn.
---
class: light, animated, fadeIn
<h1><span>Validations</span></h1>
<h2><b>The Narcan Story</b></h2>
<img src="images/narcan.png" alt="A picture of a nasal spray medication called Narcan. Narcan is used to reverse opioid overdoses." height="200px" style="float: left; margin-top:100px;"><center><br>
<font color = "#FF1616"><b>Pain</b></font> could have been avoided if I <b>validated</b> that "Narcan" variable.<br>
Assumed all the values were `numeric` types when they were really `character` types<br><br>
This situation needed a <b>variable "Type" check</b></center>
???
Validating and just properly checking the data is important because it gives us more confidence that we are handling the type of data that we are expecting.
---
class: light, animated, fadeIn
<h1><span>Validations</span></h1>
<h5><b>Examples of Validation Types I've personally used:</b></h5>
<br><center>
<img src= "images/validationtypes.png" alt="An image that shows 8 icons that represent 8 different validation types: variable type, allowed or expected values, missing data, logic checks, uniqueness, string length, formatting, and range checks. This is not an exhaustive list, but rather what Meghan Harris has personally dealt with at her previous job." width = "85%"></center>
???
These are some types I personally dealt with.
My point of showing this is to get the gears turning for you if you're tasked with the same thing. To just give you some ideas about validations you can check for if they apply to your situation.
---
class: light, animated, fadeIn
<h1><span>Validations</span></h1>
Validations can <b><font color = "#612884">exist in external and internal environments</font></b><br>
<b>Best scenario</b>: Validations are implemented <b><font color = "#612884">externally</font></b><br>
<center>
<img src="images/extvalidation.png" alt="A visual that shows the flow of data from an external environment to an internal environment when data validation is implemented externally before it reaches R or RStudio in the internal environment. It is proposed that this method of validation would hopefully be less work for the programmer as they are receiving cleaner data internally." width = "85%"></center>
---
class: light, animated, fadeIn
<h1><span>Validations</span></h1>
Validations can <b><font color = "#612884">exist in external and internal environments</font></b><br>
<b>Usual scenario</b>: Programmer has to implement it <b><font color = "#612884">internally</font></b>
<center><img src="images/intvalidation.png" alt="A visual that shows the flow of data from an external environment to an internal environment when data validation is implemented internally within R or RStudio in the internal environment. It is proposed that this method of validation would possibly be more work for the programmer as they are receiving dirtier data internally that needs to be cleaned and validated." width = "85%"></center>
---
class: light, animated, fadeIn
<h1><span>Validations</span></h1>
I did all of my validations manually in R...but there's a package for that!
<br><br>
[The `pointblank` Package](https://github.com/rich-iannone/pointblank)<br>
<p style ="font-size:20px">Iannone R, Vargas M (2022). pointblank: <i>Data Validation and Organization of Metadata for Local and Remote Tables</i>. https://rich-iannone.github.io/pointblank/, <a href="https://github.com/rich-iannone/pointblank">https://github.com/rich-iannone/pointblank</a></p>
[The `validate` Package](https://github.com/data-cleaning/validate)<br>
<p style ="font-size:20px">MPJ van der Loo and E de Jonge (2020). Data Validation Infrastructure for R. <i>Journal of Statistical Software</i>, Accepted for publication. <a href="https://arxiv.org/abs/1912.09759">https://arxiv.org/abs/1912.09759</a></p>
<br>
???
Nothing wrong with doing it manually<br>
Can be cumbersome (depending on the problem)<br>
although there's packages to help, depending on how complicated the data is, you may have to figure out some things manually. It may take longer, but you'll learn more about your data that way, so it's not all bad.<br><br>
So now that I'm late to the party and figured out I need some type of data validation, the pipeline is now looking like this...
---
class: dark, center, middle, animated, fadeIn
<img src= "images/pl_4.png" alt="The fourth version of the abstract art that represents a data pipeline when data investigations, environmental structures, and data validations are added. The image now looks even cleaner and the tangled mess is even more straightened out to start showing a type of kaleidoscopic effect with internal triangular shapes overlaid on top of one another. Although shapes can now be distinguished, the lines of the image are still a little messy with the hole in the center of it being defined even more." width = "100%" height = "100%">
???
adding validation
---
class: light, animated, fadeIn
<h1><span>Main Points</span></h1>
<br><br>
<center><img src= "images/mp_4.png" alt="The presentation's roadmap now with the fourth main point, Sustainability, highlighted." width = "100%"></center>
???
So the last main point I learned was about sustainability efforts...
---
class: light, animated, fadeIn
<h1><span>Sustainability</span></h1>
<center><br><br><br>
<p><i><font size = "48px">"Sustainability" can mean and look like <font color = "#488E35"><b>a lot of different things</b></font></font></i></p>
</center>
???
Def: Capability of something that can be maintained at length without interruption or weakening.<br>
I didn't get to do much work for sustainability because of my organization's environment<br>
I was the data team/programmer/analyst/data viz designer<br>
There was no time for comprehensive sustainability work - This is not uncommon.
<br>
Although it can look different, in my xp, there's a few main examples I can think of that I would have liked to implement in my last position.
---
class: light, animated, fadeIn
<h1><span>Sustainability</span></h1>
<h5><b>Examples of Sustainability Efforts:</b></h5>
- Data Centralization
- <font color = "#488E35">Data Workflow Documentation</font> 🌟
- <font color = "#488E35">Codebooks</font> 🌟
- <font color = "#488E35">Automated Docking Scripts </font>🌟
- Adopted Data Standardization/Validations
- Scheduled Code Reviews
- Pipeline QA and QI
- <font color = "#488E35">Operational Documentation🌟</font>
- Making Scalable Code
???
Patience is needed here: Although talking about the sustain. of the pipeline<br>
So much sustainability work involves the external environment<br>
Sustainability is hard..a lot of times, data teams may not have the bandwidth to take this on<br>
Centralization - Data is stored into one place (warehouse) but is accessible from multiple projects/scripts<br>
<br>
Planning for sustainability
---
class: light, animated, fadeIn
<h1><span>Sustainability</span></h1>
<center><img src="images/workflowex.png" alt = "An image that shows an example of a document that can be used to create pipeline sustainability by identifying, visualizing, and documenting the flow of data from outside of R and RStudio, to inside of it through processing and analyses, and out to the data's respective deliverable. Each data source is identified and marked on the top of the chart. The data processing section visualizes each modularized script and shows the order and flow of the data in the process." height = "450px"></center>
???
This is an example of one of the largest pipelines I created in R. The specifics aren't important here. The main takeaway is how helpful it can be to visualize the actual data workflow in the pipeline. Including information about the data, where it comes from, what scripts it feeds into, and what deliverables/outputs are expected upon completion of the workflow. This helps with sustainability because it allows the pipeline itself to be visually represented and explained. So not only can other people hopefully understand how it's working, but the hope is that you can also review it and use it for quality improvement.
---
class: light, animated, fadeIn
<h1><span>Sustainability</span></h1>
<center><img src="images/datamapex.png" alt = "An example of a preliminary data map that was used for opioid data. This visual shows a simple way to map different related data sources together in a simple schema. Each box represents a data source, with the variables listed in it. Solid lines connect data sources that have known matching primary keys, or variables that can be used to easily link a relationship between the data sources, while dotted lines represent potential data that can be used to link different data sources." height = "450px"></center>
???
This is a more granular visual representation of a dataset where we see a human-readable description of what's in it, but also potential primary keys - or common variables that can be used to create a relationship between the dataset. Pretty much like a database schema we'd see in SQL or other similar scenarios.
<br>
OK, so now that I've zoomed by all of that, let's recap...
---
class: dark, center, middle, animated, fadeIn
<img src= "images/initial_dream.png" alt="An abstract piece of generative art that looks like a tangled ball of black string with various lighter colors mixed in. Some may even say it looks like a messy black hole." width = "100%" height = "100%">
???
This is what the pipeline looks like if we jump in head first
---
class: dark, center, middle, animated, fadeIn
<img src= "images/pl_2.png" alt="The second version of the abstract art that represents a data pipeline when data investigations are performed. The image still looks like a tangled mess, but with the hole in the center of it widening a bit." width = "100%" height = "100%">
???
When we start investigating the data and environment...
---
class: dark, center, middle, animated, fadeIn
<img src= "images/pl_3.png" alt="The third version of the abstract art that represents a data pipeline when data investigations and environmental structures are added. The image now looks a bit cleaner and the tangled mess appears to start straightened out into some type of pattern. Some aspects of the image are still messy with the hole in the center of it widening even more." width = "100%" height = "100%">
???
when we add structures to the external and internal environment...
---
class: dark, center, middle, animated, fadeIn
<img src= "images/pl_4.png" alt="The fourth version of the abstract art that represents a data pipeline when data investigations, environmental structures, and data validations are added. The image now looks even cleaner and the tangled mess is even more straightened out to start showing a type of kaleidoscopic effect with internal triangular shapes overlaid on top of one another. Although shapes can now be distinguished, the lines of the image are still a little messy with the hole in the center of it being defined even more." width = "100%" height = "100%">
???
when we add data checks and validations
---
class: dark, center, middle, animated, fadeIn
<img src= "images/pl_5.png" alt="The fifth and final version of the abstract art that represents a data pipeline when data investigations, environmental structures, data validations, and sustainability efforts are added. The image now looks very clear. While it is not known what it is, one could say the image is pretty and orderly and no longer a tangled mess. Although it's hard to really know what this image is, enough clarity is given to appreciate that the components seem to go well together. This is representative of most data pipelines that exist in the real world. Most can be so large, or so complex, but with enough clarity, you can get something functional." width = "100%" height = "100%">
???
and finally, this is what the pipeline can look like after we implement and maintain some sustainability efforts for the pipeline...<br><br>
And you know, I still don't know what this is a picture of, but I know that it looks pretty (to me anyway) and that it has some clear shapes, straight lines, and colors. But maybe that's the point, right?<br><br>
If you are someone that is just starting out in data science, or teaching yourself how to use R like I was in this situation, something that you will, or may have learned is that having things be just "functional", can be enough. And sometimes, you can't possibly know everything about the data you are dealing with all of the time, but you can still appreciate the unknowns and the efforts that have been made to create the pipeline you have to work with.<br><br>
Of course, through time, we all want to evolve and use best practices and feel that we are 100% knowledgeable in what we do, but my hope is that I've given you guys something useful to take back with you today, to try and make this process of making data pipelines a little less painful, especially if you're going to embark on the same journey I did of making them from scratch in R. <br><br>
Thank you for listening.
</textarea>
<style data-target="print-only">@media screen {.remark-slide-container{display:block;}.remark-slide-scaler{box-shadow:none;}}</style>
<script src="https://remarkjs.com/downloads/remark-latest.min.js"></script>
<script>var slideshow = remark.create({
// Configuration object handed to remark.create(); the meaning of each option
// is defined by remark.js (see its configuration docs), not by this file.
"ratio": "16:9",
"highlightStyle": "github",
"highlightLines": true,
"countIncrementalSlides": true
});
// When window.HTMLWidgets is defined, dispatch a window 'resize' event after
// every slide is shown (presumably so embedded widgets can re-measure
// themselves once their slide becomes visible; confirm).
if (window.HTMLWidgets) slideshow.on('afterShowSlide', function (slide) {
window.dispatchEvent(new Event('resize'));
});
// Append a <style> element carrying an @page rule whose size is taken from the
// inline width/height of the first .remark-slide-scaler element
// (presumably so printed pages match the slide dimensions).
(function(doc) {
  var scaler = doc.querySelector(".remark-slide-scaler");
  if (!scaler) return; // no scaler element on the page: nothing to size
  var pageRule = "@page {size: " + scaler.style.width + " " + scaler.style.height + "; }";
  var styleEl = doc.createElement("style");
  styleEl.type = "text/css";
  styleEl.innerHTML = pageRule;
  doc.head.appendChild(styleEl);
})(document);
// For every slide (from the second onward) whose properties mark it as
// `continued: true` or `count: false`, add the class `has-continuation` to the
// slide container that precedes it, then add a print-only rule hiding those
// containers.
(function(d) {
  var areas = d.getElementsByClassName("remark-slides-area");
  // getElementsByClassName always returns a collection (possibly empty), never
  // null, so the guard must test its length; a truthiness check never fires
  // and areas[0].children would throw on an empty collection.
  if (!areas.length) return;
  var slides = slideshow.getSlides(), els = areas[0].children;
  for (var i = 1; i < slides.length; i++) {
    var slide = slides[i];
    if (slide.properties.continued === "true" || slide.properties.count === "false") {
      els[i - 1].className += ' has-continuation';
    }
  }
  var s = d.createElement("style");
  s.type = "text/css"; s.innerHTML = "@media print { .has-continuation { display: none; } }";
  d.head.appendChild(s);
})(document);
// The first time a slide is about to be shown, remove every stylesheet whose
// owner node carries data-target="print-only" (the temporary CSS that
// displays all slides initially). Later 'beforeShowSlide' events do nothing.
(function() {
  var alreadyRemoved = false;
  slideshow.on('beforeShowSlide', function(slide) {
    if (!alreadyRemoved) {
      var styleSheets = document.styleSheets;
      for (var idx = 0; idx < styleSheets.length; idx++) {
        var owner = styleSheets[idx].ownerNode;
        if (owner.dataset["target"] === "print-only") {
          owner.parentNode.removeChild(owner);
        }
      }
      alreadyRemoved = true;
    }
  });
})();
(function() {
  "use strict";
  // Script tags found inside slide containers are swapped for freshly created
  // script elements (same text, same attributes) to make them executable.
  var slideScripts = document.querySelectorAll(
    '.remark-slides-area .remark-slide-container script'
  );
  if (slideScripts.length === 0) return;
  Array.prototype.forEach.call(slideScripts, function(oldScript) {
    var freshScript = document.createElement('script');
    freshScript.appendChild(document.createTextNode(oldScript.textContent));
    // Copy every attribute (src, type, ...) across unchanged.
    Array.prototype.forEach.call(oldScript.attributes, function(attr) {
      freshScript.setAttribute(attr.name, attr.value);
    });
    oldScript.parentElement.replaceChild(freshScript, oldScript);
  });
})();
// Make every link whose href starts with http://, https:// or // open in a
// new tab. Links without an href, or with relative/fragment hrefs, are left
// untouched.
(function() {
  var links = document.getElementsByTagName('a');
  for (var i = 0; i < links.length; i++) {
    var link = links[i];
    if (/^(https?:)?\/\//.test(link.getAttribute('href'))) {
      link.target = '_blank';
      // A page opened via target=_blank can reach back through window.opener
      // in browsers that do not imply noopener; add the token explicitly while
      // keeping whatever rel value the author already set.
      var rel = link.getAttribute('rel') || '';
      if (!/(^|\s)noopener(\s|$)/.test(rel)) {
        link.setAttribute('rel', rel ? rel + ' noopener' : 'noopener');
      }
    }
  }
})();
// adds .remark-code-has-line-highlighted class to <pre> parent elements
// of code chunks containing highlighted lines with class .remark-code-line-highlighted
(function(d) {
const hlines = d.querySelectorAll('.remark-code-line-highlighted');
const preParents = [];
const findPreParent = function(line, p = 0) {
if (p > 1) return null; // traverse up no further than grandparent
const el = line.parentElement;
return el.tagName === "PRE" ? el : findPreParent(el, ++p);
};
for (let line of hlines) {
let pre = findPreParent(line);
if (pre && !preParents.includes(pre)) preParents.push(pre);
}
preParents.forEach(p => p.classList.add("remark-code-has-line-highlighted"));
})(document);</script>
<script>
// Unwrap inline <code> elements whose entire text is a math expression by
// replacing each such element with its own inner HTML.
// `el` is the root node to search; it is called below with `document`.
slideshow._releaseMath = function(el) {
var i, text, code, codes = el.getElementsByTagName('code');
// No increment in the loop header: the index only advances when the current
// element is kept (see the `continue` below).
for (i = 0; i < codes.length;) {
code = codes[i];
// Only consider <code> that is not a direct child of <pre> and that has no
// child elements.
if (code.parentNode.tagName !== 'PRE' && code.childElementCount === 0) {
text = code.textContent;
// Accepted forms: \( ... \), \[ ... \], $$ ... $$, \begin{env} ... \end{env}
if (/^\\\((.|\s)+\\\)$/.test(text) || /^\\\[(.|\s)+\\\]$/.test(text) ||
/^\$\$(.|\s)+\$\$$/.test(text) ||
/^\\begin\{([^}]+)\}(.|\s)+\\end\{[^}]+\}$/.test(text)) {
code.outerHTML = code.innerHTML; // remove <code></code>
// `codes` is a live collection: the unwrapped element has dropped out of
// it, so the next candidate now sits at the same index.
continue;
}
}
i++;
}
};
slideshow._releaseMath(document);
</script>
<!-- dynamically load mathjax for compatibility with self-contained -->
<script>
// Create a script element for MathJax and append it to <head>. Unless the page
// itself is served from file:, the leading "http:"/"https:" is stripped so the
// URL becomes protocol-relative.
(function () {
  var mathjax = document.createElement('script');
  mathjax.type = 'text/javascript';
  mathjax.src = 'https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-MML-AM_CHTML';
  var servedFromFile = location.protocol === 'file:';
  if (!servedFromFile && /^https?:/.test(mathjax.src)) {
    mathjax.src = mathjax.src.replace(/^https?:/, '');
  }
  var head = document.getElementsByTagName('head')[0];
  head.appendChild(mathjax);
})();
</script>
</body>
</html>