-
Notifications
You must be signed in to change notification settings - Fork 120
/
Copy pathc02_Starting_with_R_and_data.Rmd
1037 lines (670 loc) · 32 KB
/
c02_Starting_with_R_and_data.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
---
output: github_document
---
00003_informalexample_2.1_of_section_2.1.2.R
```{r 00003_informalexample_2.1_of_section_2.1.2.R }
# informalexample 2.1 of section 2.1.2
# (informalexample 2.1 of section 2.1.2) : Starting with R and data : Starting with R : R programming
print(seq_len(25))
# [1] 1 2 3 4 5 6 7 8 9 10 11 12
# [13] 13 14 15 16 17 18 19 20 21 22 23 24
# [25] 25
```
00004_informalexample_2.2_of_section_2.1.2.R
```{r 00004_informalexample_2.2_of_section_2.1.2.R }
# informalexample 2.2 of section 2.1.2
# (informalexample 2.2 of section 2.1.2) : Starting with R and data : Starting with R : R programming
example_vector <- c(10, 20, 30) # Note: 1
example_list <- list(a = 10, b = 20, c = 30) # Note: 2
example_vector[1] # Note: 3
## [1] 10
example_list[1]
## $a
## [1] 10
example_vector[[2]] # Note: 4
## [1] 20
example_list[[2]]
## [1] 20
example_vector[c(FALSE, TRUE, TRUE)] # Note: 5
## [1] 20 30
example_list[c(FALSE, TRUE, TRUE)]
## $b
## [1] 20
##
## $c
## [1] 30
example_list$b # Note: 6
## [1] 20
example_list[["b"]]
## [1] 20
# Note 1:
# Build an example vector. c() is R’s concatenate operator—it builds longer vectors and lists
# from shorter ones without nesting. For example
# c(1) is just the number 1, and c(1, c(2, 3)) is
# equivalent to c(1, 2, 3), which in turn is the
# integers 1 through 3 (though stored in a floating
# point format).
# Note 2:
# Build an example list.
# Note 3:
# Demonstrate vector and list use of []. Notice for the list [] returns a new short list, not the item.
# Note 4:
# Demonstrate vector and list use of [[]]. In common cases [[]] forces a single item to be returned, though for nested lists of complex type this item itself could be a list.
# Note 5:
# Vectors and lists be indexed by vectors of logicals, integers, and (if the vector or list has names) characters.
# Note 6:
# For named examples, the syntax example_list$b is essentially a short-hand for
# example_list[["b"]] (the same is true for named
# vectors).
```
00005_informalexample_2.3_of_section_2.1.2.R
```{r 00005_informalexample_2.3_of_section_2.1.2.R }
# informalexample 2.3 of section 2.1.2
# (informalexample 2.3 of section 2.1.2) : Starting with R and data : Starting with R : R programming
x <- 1:5
print(x) # Note: 1
# [1] 1 2 3 4 5
# x <- cumsumMISSPELLED(x) # Note: 2
# Error in cumsumMISSPELLED(x) : could not find function "cumsumMISSPELLED"
print(x) # Note: 3
# [1] 1 2 3 4 5
x <- cumsum(x) # Note: 4
print(x)
# [1] 1 3 6 10 15
# Note 1:
# Define a value we are interested in and store it in the variable x.
# Note 2:
# Attempt, and fail, to assign a new result to x.
# Note 3:
# Notice in addition to supply a useful error message, R preserved the original value of x.
# Note 4:
# Try the operation again, using the correct spelling of cumsum(). cumsum(), short for cumulative sum, is a useful function that computes running totals quickly.
```
00006_informalexample_2.4_of_section_2.1.2.R
```{r 00006_informalexample_2.4_of_section_2.1.2.R }
# informalexample 2.4 of section 2.1.2
# (informalexample 2.4 of section 2.1.2) : Starting with R and data : Starting with R : R programming
nchar("a string")
# [1] 8
nchar(c("a", "aa", "aaa", "aaaa"))
# [1] 1 2 3 4
```
00007_informalexample_2.5_of_section_2.1.2.R
```{r 00007_informalexample_2.5_of_section_2.1.2.R }
# informalexample 2.5 of section 2.1.2
# (informalexample 2.5 of section 2.1.2) : Starting with R and data : Starting with R : R programming
1 +
2
```
00008_informalexample_2.6_of_section_2.1.2.R
```{r 00008_informalexample_2.6_of_section_2.1.2.R }
# informalexample 2.6 of section 2.1.2
# (informalexample 2.6 of section 2.1.2) : Starting with R and data : Starting with R : R programming
1
+ 2
```
00009_informalexample_2.7_of_section_2.1.2.R
```{r 00009_informalexample_2.7_of_section_2.1.2.R }
# informalexample 2.7 of section 2.1.2
# (informalexample 2.7 of section 2.1.2) : Starting with R and data : Starting with R : R programming
d <- data.frame(x = c(1, NA, 3)) # Note: 1
print(d)
# x
# 1 1
# 2 NA
# 3 3 # Note: 2
d$x[is.na(d$x)] <- 0 # Note: 3
print(d)
# x
# 1 1
# 2 0
# 3 3
# Note 1:
# “data.frame” is R’s tabular data type, and the most important data type in R. A data.frame
# holds data organized in rows and columns.
# Note 2:
# When printing data.frames, row numbers are shown in the fist (unnamed) column, and column
# values are shown under their matching column
# names.
# Note 3:
# We can place a slice or selection of the x column of d on the left-hand side of an assignment to easily replace all NA values with zero.
```
00010_informalexample_2.8_of_section_2.1.2.R
```{r 00010_informalexample_2.8_of_section_2.1.2.R }
# informalexample 2.8 of section 2.1.2
# (informalexample 2.8 of section 2.1.2) : Starting with R and data : Starting with R : R programming
d <- data.frame(x = 1, y = 2) # Note: 1
d2 <- d # Note: 2
d$x <- 5 # Note: 3
print(d)
# x y
# 1 5 2
print(d2)
# x y
# 1 1 2
# Note 1:
# Create some example data and refer to it by the name d.
# Note 2:
# Create an additional reference d2 to the same data.
# Note 3:
# Alter the value referred to by d.
```
00011_informalexample_2.9_of_section_2.1.2.R
```{r 00011_informalexample_2.9_of_section_2.1.2.R }
# informalexample 2.9 of section 2.1.2
# (informalexample 2.9 of section 2.1.2) : Starting with R and data : Starting with R : R programming
data <- data.frame(revenue = c(2, 1, 2), # Note: 1
sort_key = c("b", "c", "a"),
stringsAsFactors = FALSE)
print(data)
# revenue sort_key
# 1 2 b
# 2 1 c
# 3 2 a
. <- data # Note: 2
. <- .[order(.$sort_key), , drop = FALSE] # Note: 3
.$ordered_sum_revenue <- cumsum(.$revenue)
.$fraction_revenue_seen <- .$ordered_sum_revenue/sum(.$revenue)
result <- . # Note: 4
print(result)
# revenue sort_key ordered_sum_revenue fraction_revenue_seen
# 3 2 a 2 0.4
# 1 2 b 4 0.8
# 2 1 c 5 1.0
# Note 1:
# Our notional, or example data.
# Note 2:
# Assign our data to a temporary variable named “.”. The original values will remain available
# in the “data” variable, making it easy to re-start
# the calculation from the beginning if
# necessary.
# Note 3:
# Use the order command to sort the rows. The “drop = FALSE” is not strictly needed, but it is
# good to get in the habit of including it. For
# single column data.frames without the “drop =
# FALSE” argument, the “[,]” indexing operator will
# convert the result to a vector, which is almost
# never the R user's true intent. The “drop = FALSE”
# argument turns off this conversion, and it is a
# good idea to include it “just in case” and a
# definite requirement when either the data.frame
# has a single column or when we don’t know if the
# data.frame has more than one column (as the
# data.frame comes from somewhere else).
# Note 4:
# Assign the result away from “.” to a more memorable variable name.
```
00012_informalexample_2.10_of_section_2.1.2.R
```{r 00012_informalexample_2.10_of_section_2.1.2.R }
# informalexample 2.10 of section 2.1.2
# (informalexample 2.10 of section 2.1.2) : Starting with R and data : Starting with R : R programming
library("dplyr")
result <- data %>%
arrange(., sort_key) %>%
mutate(., ordered_sum_revenue = cumsum(revenue)) %>%
mutate(., fraction_revenue_seen = ordered_sum_revenue/sum(revenue))
```
00013_informalexample_2.11_of_section_2.1.2.R
```{r 00013_informalexample_2.11_of_section_2.1.2.R }
# informalexample 2.11 of section 2.1.2
# (informalexample 2.11 of section 2.1.2) : Starting with R and data : Starting with R : R programming
d <- data.frame(col1 = c(1, 2, 3), col2 = c(-1, 0, 1))
d$col3 <- d$col1 + d$col2
print(d)
# col1 col2 col3
# 1 1 -1 0
# 2 2 0 2
# 3 3 1 4
```
00015_example_2.1_of_section_2.2.1.R
```{r 00015_example_2.1_of_section_2.2.1.R }
# example 2.1 of section 2.2.1
# (example 2.1 of section 2.2.1) : Starting with R and data : Working with data from files : Working with well-structured data from files or URLs
# Title: Reading the UCI car data
uciCar <- read.table( # Note: 1
'../UCICar/car.data.csv', # Note: 2
sep = ',', # Note: 3
header = TRUE, # Note: 4
stringsAsFactor = TRUE # Note: 5
)
View(uciCar) # Note: 6
# Note 1:
# Command to read from a file or URL and store the result in a new data frame object
# called
# uciCar.
# Note 2:
# Filename or URL to get the data from.
# Note 3:
# Specify the column or field separator as a
# comma.
# Note 4:
# Tell R to expect a header line that defines
# the data column names.
# Note 5:
# Tell R to convert string values to factors. This is the default behavior, so we are just using this argument to document intent.
# Note 6:
# Examine the data with R’s built-in
# table viewer.
```
00016_example_2.2_of_section_2.2.1.R
```{r 00016_example_2.2_of_section_2.2.1.R }
# example 2.2 of section 2.2.1
# (example 2.2 of section 2.2.1) : Starting with R and data : Working with data from files : Working with well-structured data from files or URLs
# Title: Exploring the car data
class(uciCar)
## [1] "data.frame" # Note: 1
summary(uciCar)
## buying maint doors
## high :432 high :432 2 :432
## low :432 low :432 3 :432
## med :432 med :432 4 :432
## vhigh:432 vhigh:432 5more:432
##
## persons lug_boot safety
## 2 :576 big :576 high:576
## 4 :576 med :576 low :576
## more:576 small:576 med :576
##
## rating
## acc : 384
## good : 69
## unacc:1210
## vgood: 65
dim(uciCar)
## [1] 1728 7 # Note: 2
# Note 1:
# The loaded object uciCar is of type data.frame.
# Note 2:
# The “[1]” is just an output sequence
# marker. The actual information is this: uciCar has
# 1728 rows and 7 columns. Always try to confirm you
# got a good parse by at least checking that the
# number of rows is exactly one fewer than the
# number of lines of text in the original file. The
# difference of one is because the column header
# counts as a line of text, but not as a data row.
```
00018_example_2.3_of_section_2.2.2.R
```{r 00018_example_2.3_of_section_2.2.2.R }
# example 2.3 of section 2.2.2
# (example 2.3 of section 2.2.2) : Starting with R and data : Working with data from files : Using R with less-structured data
# Title: Loading the credit dataset
setwd("../../PDSwR2/Statlog") # Note: 1
d <- read.table('../Statlog/german.data', sep=' ',
stringsAsFactors = FALSE, header = FALSE)
# Note 1:
# Replace this path with the actual path where you have saved PDSwR2.
```
00019_example_2.4_of_section_2.2.2.R
```{r 00019_example_2.4_of_section_2.2.2.R }
# example 2.4 of section 2.2.2
# (example 2.4 of section 2.2.2) : Starting with R and data : Working with data from files : Using R with less-structured data
# Title: Setting column names
d <- read.table('../Statlog/german.data',
sep = " ",
stringsAsFactors = FALSE, header = FALSE)
colnames(d) <- c('Status_of_existing_checking_account', 'Duration_in_month',
'Credit_history', 'Purpose', 'Credit_amount', 'Savings_account_bonds',
'Present_employment_since',
'Installment_rate_in_percentage_of_disposable_income',
'Personal_status_and_sex', 'Other_debtors_guarantors',
'Present_residence_since', 'Property', 'Age_in_years',
'Other_installment_plans', 'Housing',
'Number_of_existing_credits_at_this_bank', 'Job',
'Number_of_people_being_liable_to_provide_maintenance_for',
'Telephone', 'foreign_worker', 'Good_Loan')
str(d)
## 'data.frame': 1000 obs. of 21 variables:
## $ Status_of_existing_checking_account : chr "A11" "A12" "A14" "A11" ...
## $ Duration_in_month : int 6 48 12 42 24 36 24 36 12 30 ...
## $ Credit_history : chr "A34" "A32" "A34" "A32" ...
## $ Purpose : chr "A43" "A43" "A46" "A42" ...
## $ Credit_amount : int 1169 5951 2096 7882 4870 9055 2835 6948 3059 5234 ...
## $ Savings_account_bonds : chr "A65" "A61" "A61" "A61" ...
## $ Present_employment_since : chr "A75" "A73" "A74" "A74" ...
## $ Installment_rate_in_percentage_of_disposable_income : int 4 2 2 2 3 2 3 2 2 4 ...
## $ Personal_status_and_sex : chr "A93" "A92" "A93" "A93" ...
## $ Other_debtors_guarantors : chr "A101" "A101" "A101" "A103" ...
## $ Present_residence_since : int 4 2 3 4 4 4 4 2 4 2 ...
## $ Property : chr "A121" "A121" "A121" "A122" ...
## $ Age_in_years : int 67 22 49 45 53 35 53 35 61 28 ...
## $ Other_installment_plans : chr "A143" "A143" "A143" "A143" ...
## $ Housing : chr "A152" "A152" "A152" "A153" ...
## $ Number_of_existing_credits_at_this_bank : int 2 1 1 1 2 1 1 1 1 2 ...
## $ Job : chr "A173" "A173" "A172" "A173" ...
## $ Number_of_people_being_liable_to_provide_maintenance_for: int 1 1 2 2 2 2 1 1 1 1 ...
## $ Telephone : chr "A192" "A191" "A191" "A191" ...
## $ foreign_worker : chr "A201" "A201" "A201" "A201" ...
## $ Good_Loan : int 1 2 1 1 2 1 1 1 1 2 ...
```
00021_example_2.5_of_section_2.2.2.R
```{r 00021_example_2.5_of_section_2.2.2.R }
# example 2.5 of section 2.2.2
# (example 2.5 of section 2.2.2) : Starting with R and data : Working with data from files : Using R with less-structured data
# Title: Transforming the car data
source("../Statlog/mapping.R") # Note: 1
for(ci in colnames(d)) { # Note: 2
if(is.character(d[[ci]])) {
d[[ci]] <- as.factor(mapping[d[[ci]]]) # Note: 3
}
}
# Note 1:
# This file can be found at https://github.com/WinVector/PDSwR2/blob/master/Statlog/../Statlog/mapping.R .
# Note 2:
# Prefer using column names to column indices.
# Note 3:
# The [[]] notation is using the fact
# that data.frames are named lists of columns. So we
# are working on each column in turn. Notice the
# mapping lookup is vectorized: i.e. it is applied
# to all elements in the column in one step.
```
00022_example_2.6_of_section_2.2.2.R
```{r 00022_example_2.6_of_section_2.2.2.R }
# example 2.6 of section 2.2.2
# (example 2.6 of section 2.2.2) : Starting with R and data : Working with data from files : Using R with less-structured data
# Title: Summary of Good_Loan and Purpose
setwd("../../PDSwR2/Statlog") # Note: 1
d <- readRDS("creditdata.RDS") # Note: 2
table(d$Purpose, d$Good_Loan)
## BadLoan GoodLoan
## business 34 63
## car (new) 89 145
## car (used) 17 86
## domestic appliances 4 8
## education 22 28
## furniture/equipment 58 123
## others 5 7
## radio/television 62 218
## repairs 8 14
## retraining 1 8
# Note 1:
# Set the working directory. You will have to replace ../../PDSwR2/Statlog with the actual full path to Statlog on your machine.
# Note 2:
# Read the prepared statlog data.
```
00024_example_2.8_of_section_2.3.1.R
```{r 00024_example_2.8_of_section_2.3.1.R }
# example 2.8 of section 2.3.1
# (example 2.8 of section 2.3.1) : Starting with R and data : Working with relational databases : A production-size example
# Title: Loading data into R from a relational database
library("DBI")
library("dplyr") # Note: 1
library("rquery")
dlist <- readRDS("../PUMS/PUMSsample.RDS") # Note: 2
db <- dbConnect(RSQLite::SQLite(), ":memory:") # Note: 3
dbWriteTable(db, "dpus", as.data.frame(dlist$ss16pus)) # Note: 4
dbWriteTable(db, "dhus", as.data.frame(dlist$ss16hus))
rm(list = "dlist") # Note: 5
dbGetQuery(db, "SELECT * FROM dpus LIMIT 5") # Note: 6
dpus <- tbl(db, "dpus") # Note: 7
dhus <- tbl(db, "dhus")
print(dpus) # Note: 8
glimpse(dpus)
View(rsummary(db, "dpus")) # Note: 9
# Note 1:
# Attach some packages we wish to use commands
# and functions from.
# Note 2:
# Load the data from the compressed RDS disk
# format into R memory. Note: you will need to
# change the path PUMSsample to where you have saved
# the contents of PDSwR2/PUMS.
# Note 3:
# Connect to a new RSQLite in-memory database.
# We are going to use RSQLite for our examples. In
# practice you would connect to a pre-existing
# database such as PostgreSQL or Spark with
# pre-existing tables.
# Note 4:
# Copy the data from the in-memory structure
# dlist into the database.
# Note 5:
# Remove our local copy of the data, as we are simulating having found the data in the database.
# Note 6:
# Use the SQL query language for a quick look at up to 5 rows of our data.
# Note 7:
# Build dplyr handles that refer to the remote database data.
# Note 8:
# Use dplyr to examine and work with the remote data.
# Note 9:
# Use the rquery package to get a summary of the remote data.
```
00025_example_2.9_of_section_2.3.1.R
```{r 00025_example_2.9_of_section_2.3.1.R }
# example 2.9 of section 2.3.1
# (example 2.9 of section 2.3.1) : Starting with R and data : Working with relational databases : A production-size example
# Title: Loading data from a database
dpus <- dbReadTable(db, "dpus") # Note: 1
dpus <- dpus[, c("AGEP", "COW", "ESR", "PERNP",
"PINCP","SCHL", "SEX", "WKHP")] # Note: 2
for(ci in c("AGEP", "PERNP", "PINCP", "WKHP")) { # Note: 3
dpus[[ci]] <- as.numeric(dpus[[ci]])
}
dpus$COW <- strtrim(dpus$COW, 50) # Note: 4
str(dpus) # Note: 5
# Note 1:
# Copy data from the database into R memory. This assumes we are continuing from the previous
# example, so the packages we have attached are
# still available and the database handle
# db is still valid.
# Note 2:
# Select a subset of columns we want to work with. Restricting columns is not required, but
# improves legibility of later printing.
# Note 3:
# All of the columns in this copy of PUMS data are stored as character type to preserve features such as leading zeros
# from the original data. Here we are converting columns we wish to treat as numeric to the numeric type. Non-numeric values, often missing entries,
# get coded with the symbol NA which stands for not available.
# Note 4:
# The PUMS level names are very long (which is one of the reasons these columns are distributed
# as integers), so for this dataset that has level
# names instead of level codes we are shortening the
# employment codes to no more than 50
# characters.
# Note 5:
# Look at the first few rows of data in a column orientation.
```
00026_example_2.10_of_section_2.3.1.R
```{r 00026_example_2.10_of_section_2.3.1.R }
# example 2.10 of section 2.3.1
# (example 2.10 of section 2.3.1) : Starting with R and data : Working with relational databases : A production-size example
# Title: Re-mapping values and selecting rows from data
target_emp_levs <- c( # Note: 1
"Employee of a private for-profit company or busine",
"Employee of a private not-for-profit, tax-exempt, ",
"Federal government employee",
"Local government employee (city, county, etc.)",
"Self-employed in own incorporated business, profes",
"Self-employed in own not incorporated business, pr",
"State government employee")
complete <- complete.cases(dpus) # Note: 2
stdworker <- with(dpus, # Note: 3
(PINCP>1000) &
(ESR=="Civilian employed, at work") &
(PINCP<=250000) &
(PERNP>1000) & (PERNP<=250000) &
(WKHP>=30) &
(AGEP>=18) & (AGEP<=65) &
(COW %in% target_emp_levs))
dpus <- dpus[complete & stdworker, , drop = FALSE] # Note: 4
no_advanced_degree <- is.na(dpus$SCHL) | # Note: 5
(!(dpus$SCHL %in% c("Associate's degree",
"Bachelor's degree",
"Doctorate degree",
"Master's degree",
"Professional degree beyond a bachelor's degree")))
dpus$SCHL[no_advanced_degree] <- "No Advanced Degree"
dpus$SCHL <- relevel(factor(dpus$SCHL), # Note: 6
"No Advanced Degree")
dpus$COW <- relevel(factor(dpus$COW),
target_emp_levs[[1]])
dpus$ESR <- relevel(factor(dpus$ESR),
"Civilian employed, at work")
dpus$SEX <- relevel(factor(dpus$SEX),
"Male")
saveRDS(dpus, "dpus_std_employee.RDS") # Note: 7
summary(dpus) # Note: 8
# Note 1:
# Define a vector of employment definitions we consider “standard.”
# Note 2:
# Build a new logical vector indicating which rows have valid values in all of our columns of
# interest. In real applications, dealing with
# missing values is important and cannot always be
# handled by skipping incomplete rows. We will
# return to the issue of properly dealing with
# missing values when we discuss managing
# data.
# Note 3:
# Build a new logical vector indicating which workers we consider typical full-time employees.
# Each of these column names are the ones we
# discussed earlier. The results of any analysis
# will be heavily influenced by this definition, so
# in a real task we would spend a lot of time
# researching the choices in this step. It literally
# controls who and what we are studying. Notice to
# keep things simple and homogeneous we restricted
# this study to civilians, which would be an
# unacceptable limitation in a complete work.
# Note 4:
# Restrict only to rows or examples that meet or definition of typical worker.
# Note 5:
# Re-code education, merging the less-than-bachelor’s-degree levels to the single level “No
# Advanced Degree”.
# Note 6:
# Convert our string-valued columns to factors, picking the reference level with the re-level()
# function.
# Note 7:
# Save this data to a file so we can use it in later examples. This file is also already
# available at the path
# PDSwR2/PUMS/dpus_std_employee.RDS
# Note 8:
# Take a look at our data. One of the advantages of factors is summary() builds up useful counts
# for them. However, it was best to delay having
# string codes as factors until after we finished
# with re-mapping level codes.
```
00027_informalexample_2.15_of_section_2.3.1.R
```{r 00027_informalexample_2.15_of_section_2.3.1.R }
# informalexample 2.15 of section 2.3.1
# (informalexample 2.15 of section 2.3.1) : Starting with R and data : Working with relational databases : A production-size example
levels(dpus$SCHL) # Note: 1
## [1] "No Advanced Degree" "Associate's degree"
## [3] "Bachelor's degree" "Doctorate degree"
## [5] "Master's degree" "Professional degree beyond a bachelor's degree"
head(dpus$SCHL) # Note: 2
## [1] Associate's degree Associate's degree Associate's degree No Advanced Degree Doctorate degree Associate's degree
## 6 Levels: No Advanced Degree Associate's degree Bachelor's degree Doctorate degree ... Professional degree beyond a bachelor's degree
str(dpus$SCHL) # Note: 3
## Factor w/ 6 levels "No Advanced Degree",..: 2 2 2 1 4 2 1 5 1 1 ...
# Note 1:
# Show the possible levels for SCHL.
# Note 2:
# Show the first few string values for SCHL.
# Note 3:
# Show how the first few levels are represented as codes.
```
00028_informalexample_2.16_of_section_2.3.1.R
```{r 00028_informalexample_2.16_of_section_2.3.1.R }
# informalexample 2.16 of section 2.3.1
# (informalexample 2.16 of section 2.3.1) : Starting with R and data : Working with relational databases : A production-size example
d <- cbind( # Note: 1
data.frame(SCHL = as.character(dpus$SCHL), # Note: 2
stringsAsFactors = FALSE),
model.matrix(~SCHL, dpus) # Note: 3
)
d$'(Intercept)' <- NULL # Note: 4
str(d) # Note: 5
## 'data.frame': 41305 obs. of 6 variables:
## $ SCHL : chr "Associate's degree" "Associate's degree" "Associate's degree" "No Advanced Degree" ...
## $ SCHLAssociate's degree : num 1 1 1 0 0 1 0 0 0 0 ...
## $ SCHLBachelor's degree : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SCHLDoctorate degree : num 0 0 0 0 1 0 0 0 0 0 ...
## $ SCHLMaster's degree : num 0 0 0 0 0 0 0 1 0 0 ...
## $ SCHLProfessional degree beyond a bachelor's degree: num 0 0 0 0 0 0 0 0 0 0 ...
# Note 1:
# The cbind operator combines two data frames by columns, or each row is built by matching columns from rows in each data frame.
# Note 2:
# Build a data.frame with the SCHL column re-coded as character strings instead of as a factor.
# Note 3:
# Build a matrix with dummy variables generated from the SCHL factor column.
# Note 4:
# Remove a column named "(Intercept)" from the data.frame, as it is a side-effect of model.matrix we are not interested in at this time.
# Note 5:
# Show the structure that presents the original SCHL string form along with the indicators.
# str() presents the first few rows in transpose format (flipped so rows are now up and down and columns are across).
```
00029_informalexample_2.17_of_section_2.3.1.R
```{r 00029_informalexample_2.17_of_section_2.3.1.R }
# informalexample 2.17 of section 2.3.1
# (informalexample 2.17 of section 2.3.1) : Starting with R and data : Working with relational databases : A production-size example
table(schooling = dpus$SCHL, sex = dpus$SEX) # Note: 1
## sex
## schooling Male Female
## No Advanced Degree 13178 9350
## Associate's degree 1796 2088
## Bachelor's degree 4927 4519
## Doctorate degree 361 269
## Master's degree 1792 2225
## Professional degree beyond a bachelor's degree 421 379
tapply( # Note: 2
dpus$PINCP, # Note: 3
list(dpus$SCHL, dpus$SEX), # Note: 4
FUN = mean # Note: 5
)
## Male Female
## No Advanced Degree 44304.21 33117.37
## Associate's degree 56971.93 42002.06
## Bachelor's degree 76111.84 57260.44
## Doctorate degree 104943.33 89336.99
## Master's degree 94663.41 69104.54
## Professional degree beyond a bachelor's degree 111047.26 92071.56
# Note 1:
# Use the table command to count how even each pair of SCHL and SEX occurs.
# Note 2:
# Use tapply to tally how often each pair of SCHL of SEX occurs
# Note 3:
# This argument is the vector of data we are aggregating or summarizing in the tapply.
# Note 4:
# This argument list specifies how we are grouping the data, in this case simultaneously by SCHL and SEX.
# Note 5:
# The argument specifies how we are aggregating values, in this case we are taking the mean or average using the mean function.
```
00030_informalexample_2.18_of_section_2.3.1.R
```{r 00030_informalexample_2.18_of_section_2.3.1.R }
# informalexample 2.18 of section 2.3.1
# (informalexample 2.18 of section 2.3.1) : Starting with R and data : Working with relational databases : A production-size example
library("dplyr")
dpus %>%
group_by(., SCHL, SEX) %>%
summarize(.,
count = n(),
mean_income = mean(PINCP)) %>%
ungroup(.) %>%
arrange(., SCHL, SEX)
## # A tibble: 12 x 4
## SCHL SEX count mean_income
## <fct> <fct> <int> <dbl>
## 1 No Advanced Degree Male 13178 44304.
## 2 No Advanced Degree Female 9350 33117.