-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathimporting from sql.R
198 lines (140 loc) · 6.83 KB
/
importing from sql.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
#The first step to import data from a SQL database is creating a connection to it. As Filip explained, you need different packages depending on the database you want to connect to. All of these packages do this in a uniform way, as specified in the DBI package.
#dbConnect() creates a connection between your R session and a SQL database. The first argument has to be a DBIdriver object, that specifies how connections are made and how data is mapped between R and the database. Specifically for MySQL databases, you can build such a driver with RMySQL::MySQL().
#If the MySQL database is a remote database hosted on a server, you'll also have to specify the following arguments in dbConnect(): dbname, host, port, user and password. Most of these details have already been provided.
# Load the DBI package
library('DBI')
# Edit dbConnect() call
con <- dbConnect(RMySQL::MySQL(),
dbname = "tweater",
host = "courses.csrrinzqubik.us-east-1.rds.amazonaws.com",
port = 3306,
user = "student",
password = "datacamp")
# After you've successfully connected to a remote MySQL database, the next step is to see what tables the database contains. You can do this with the dbListTables() function.
# Load the DBI package
library(DBI)
# Connect to the MySQL database: con
con <- dbConnect(RMySQL::MySQL(),
dbname = "tweater",
host = "courses.csrrinzqubik.us-east-1.rds.amazonaws.com",
port = 3306,
user = "student",
password = "datacamp")
# Build a vector of table names: tables
tables = dbListTables(con)
# Display structure of tables
str(tables)
# Load the DBI package
library(DBI)
# Connect to the MySQL database: con
con <- dbConnect(RMySQL::MySQL(),
dbname = "tweater",
host = "courses.csrrinzqubik.us-east-1.rds.amazonaws.com",
port = 3306,
user = "student",
password = "datacamp")
# Import the users table from tweater: users
users = dbReadTable(con,'users')
# Print users
users
# Load the DBI package
library(DBI)
# Connect to the MySQL database: con
con <- dbConnect(RMySQL::MySQL(),
dbname = "tweater",
host = "courses.csrrinzqubik.us-east-1.rds.amazonaws.com",
port = 3306,
user = "student",
password = "datacamp")
# Get table names
table_names <- dbListTables(con)
# Import all tables
tables <- lapply(table_names, dbReadTable, conn = con)
# Print out tables
tables
#In your life as a data scientist, you'll often be working with huge databases that contain tables with millions of rows. If want to do some analyses on this data, it's possible that you only need a fraction of this data. In this case, it's a good idea to send SQL queries to your database, and only import the data you actually need into R.
# Connect to the database
library(DBI)
con <- dbConnect(RMySQL::MySQL(),
dbname = "tweater",
host = "courses.csrrinzqubik.us-east-1.rds.amazonaws.com",
port = 3306,
user = "student",
password = "datacamp")
# Import tweat_id column of comments where user_id is 1: elisabeth
elisabeth <- dbGetQuery(con,"SELECT tweat_id FROM comments where user_id =1")
# Print elisabeth
elisabeth
# Connect to the database
library(DBI)
con <- dbConnect(RMySQL::MySQL(),
dbname = "tweater",
host = "courses.csrrinzqubik.us-east-1.rds.amazonaws.com",
port = 3306,
user = "student",
password = "datacamp")
# Import post column of tweats where date is higher than '2015-09-21': latest
latest <- dbGetQuery(con,"SELECT post FROM tweats where date > '2015-09-21' ")
# Print latest
latest
# Connect to the database
library(DBI)
con <- dbConnect(RMySQL::MySQL(),
dbname = "tweater",
host = "courses.csrrinzqubik.us-east-1.rds.amazonaws.com",
port = 3306,
user = "student",
password = "datacamp")
# Create data frame specific
specific <- dbGetQuery(con,"SELECT message FROM comments WHERE tweat_id =77 AND user_id >4")
# Print specific
specific
# Connect to the database
library(DBI)
con <- dbConnect(RMySQL::MySQL(),
dbname = "tweater",
host = "courses.csrrinzqubik.us-east-1.rds.amazonaws.com",
port = 3306,
user = "student",
password = "datacamp")
# Create data frame short
short <- dbGetQuery(con,"SELECT id, name FROM users WHERE CHAR_LENGTH(name)<5")
# Print short
short
#You've used dbGetQuery() multiple times now. This is a virtual function from the DBI package, but is actually implemented by the RMySQL package. Behind the scenes, the following steps are performed:
#Sending the specified query with dbSendQuery();
#Fetching the result of executing the query on the database with dbFetch();
#Clearing the result with dbClearResult().
#Let's not use dbGetQuery() this time and implement the steps above. This is tedious to write, but it gives you the ability to fetch the query's result in chunks rather than all at once. You can do this by specifying the n argument inside dbFetch().
# Connect to the database
library(DBI)
con <- dbConnect(RMySQL::MySQL(),
dbname = "tweater",
host = "courses.csrrinzqubik.us-east-1.rds.amazonaws.com",
port = 3306,
user = "student",
password = "datacamp")
# Send query to the database
res <- dbSendQuery(con, "SELECT * FROM comments WHERE user_id > 4")
# Use dbFetch() twice
dbFetch(res,n=2)
dbFetch(res)
# Clear res
dbClearResult(res)
#Every time you connect to a database using dbConnect(), you're creating a new connection to the database you're referencing. RMySQL automatically specifies a maximum of open connections and closes some of the connections for you, but still: it's always polite to manually disconnect from the database afterwards. You do this with the dbDisconnect() function.
# Load RMySQL package
library(DBI)
# Connect to the database
library(DBI)
con <- dbConnect(RMySQL::MySQL(),
dbname = "tweater",
host = "courses.csrrinzqubik.us-east-1.rds.amazonaws.com",
port = 3306,
user = "student",
password = "datacamp")
# Create the data frame long_tweats
long_tweats <- dbGetQuery(con,"SELECT post,date FROM tweats WHERE CHAR_LENGTH(post)>40")
# Print long_tweats
long_tweats
# Disconnect from the database
dbDisconnect(con)