Add files via upload #30

Open · wants to merge 1 commit into master
23 changes: 23 additions & 0 deletions bigdatalab/config/flume/conf/net.conf
@@ -0,0 +1,23 @@
# Naming the components on the current agent

NetcatAgent.sources = Netcat
NetcatAgent.channels = MemChannel
NetcatAgent.sinks = LoggerSink

# Describing/Configuring the source

NetcatAgent.sources.Netcat.type = netcat
NetcatAgent.sources.Netcat.bind = quickstart.cloudera
NetcatAgent.sources.Netcat.port = 56565
NetcatAgent.sources.Netcat.channels = MemChannel

# Describing/Configuring the sink

NetcatAgent.sinks.LoggerSink.type = logger
NetcatAgent.sinks.LoggerSink.channel = MemChannel

# Describing/Configuring the channel

NetcatAgent.channels.MemChannel.type = memory
NetcatAgent.channels.MemChannel.capacity = 1000
NetcatAgent.channels.MemChannel.transactionCapacity = 100
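
A quick way to exercise this netcat source, once the agent has been started with net.conf, is to push a newline-terminated test event to the configured bind address and port. A minimal Python sketch, assuming quickstart.cloudera:56565 is reachable from where it runs:

# Sketch: send one test event to the netcat source defined above.
# Assumes the Flume agent is running and quickstart.cloudera:56565 is reachable.
import socket

with socket.create_connection(("quickstart.cloudera", 56565), timeout=10) as sock:
    sock.sendall(b"hello flume\n")                      # each newline-terminated line becomes one event
    print(sock.recv(16).decode(errors="replace"))       # the netcat source acks each event with "OK" by default

The logger sink should then show the event in the agent's log output.
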
21 changes: 21 additions & 0 deletions bigdatalab/config/flume/conf/nethd.conf
@@ -0,0 +1,21 @@
NetcatAgent.sources = Netcat
NetcatAgent.channels = MemChannel
NetcatAgent.sinks = hdfssink

NetcatAgent.sources.Netcat.type = netcat
NetcatAgent.sources.Netcat.bind = quickstart.cloudera
NetcatAgent.sources.Netcat.port = 56563
NetcatAgent.sources.Netcat.channels = MemChannel

NetcatAgent.channels.MemChannel.type = memory
NetcatAgent.channels.MemChannel.capacity = 1000

# Describing/Configuring the HDFS sink and binding it to MemChannel

NetcatAgent.sinks.hdfssink.type = hdfs
NetcatAgent.sinks.hdfssink.channel = MemChannel
NetcatAgent.sinks.hdfssink.hdfs.path = /user/cloudera/flumedata/
NetcatAgent.sinks.hdfssink.hdfs.fileType = DataStream
NetcatAgent.sinks.hdfssink.hdfs.writeFormat = Text
NetcatAgent.sinks.hdfssink.hdfs.filePrefix=
NetcatAgent.sinks.hdfssink.hdfs.fileSuffix=.txt
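
The same kind of smoke test applies here, except the events end up in HDFS instead of the agent log. A rough sketch, assuming the agent is listening on quickstart.cloudera:56563 and the hdfs CLI is available on the machine running it:

# Sketch: push a few events to the netcat source from nethd.conf, then list the HDFS output directory.
# Assumes the Flume agent is running on quickstart.cloudera:56563 and the hdfs command is on the PATH.
import socket
import subprocess

with socket.create_connection(("quickstart.cloudera", 56563), timeout=10) as sock:
    for i in range(3):
        sock.sendall(f"event {i}\n".encode())

# Rolled files should appear under the sink's hdfs.path with the configured .txt suffix.
subprocess.run(["hdfs", "dfs", "-ls", "/user/cloudera/flumedata/"], check=False)
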
28 changes: 28 additions & 0 deletions bigdatalab/config/flume/conf/twitter.conf
@@ -0,0 +1,28 @@
TwitterAgent.sources = Twitter
TwitterAgent.channels = MemChannel
TwitterAgent.sinks = HDFS

# Describing/Configuring the source
TwitterAgent.sources.Twitter.type = org.apache.flume.source.twitter.TwitterSource
TwitterAgent.sources.Twitter.consumerKey = NbIJpPFfD1Utc3rniAZb6LEV0
TwitterAgent.sources.Twitter.consumerSecret = a68tIE0Br5vGjZ1WSqisSCiHj6oBHy2VmusECNfMt9YjoQWQHc
TwitterAgent.sources.Twitter.accessToken = 1052482036764618757-2bac0mOpZz1MBWISzn4Bskftxf3Are
TwitterAgent.sources.Twitter.accessTokenSecret = 35yKKMHGmvbx0wpnl4fuTrx2eOvPNiuiFfx7Pf3xtO6Ny

TwitterAgent.sinks.HDFS.type = hdfs
TwitterAgent.sinks.HDFS.hdfs.path = /user/cloudera/twitter_data/
TwitterAgent.sinks.HDFS.hdfs.fileType = DataStream
TwitterAgent.sinks.HDFS.hdfs.writeFormat = Text
TwitterAgent.sinks.HDFS.hdfs.batchSize = 5
TwitterAgent.sinks.HDFS.hdfs.rollSize = 0
TwitterAgent.sinks.HDFS.hdfs.rollCount = 10

# Describing/Configuring the channel

TwitterAgent.channels.MemChannel.type = memory
TwitterAgent.channels.MemChannel.capacity = 10000
TwitterAgent.channels.MemChannel.transactionCapacity = 100

# Binding the source and sink to the channel
TwitterAgent.sources.Twitter.channels = MemChannel
TwitterAgent.sinks.HDFS.channel = MemChannel
4 binary files not shown.
44 changes: 44 additions & 0 deletions bigdatalab/config/hive/media_customer.ddl
@@ -0,0 +1,44 @@
CREATE EXTERNAL TABLE media_demo_customer(
cust_id int,
last_name string,
first_name string,
street_address string,
postal_code string,
city_id int,
city string,
state_province_id int,
state_province string,
country_id int,
country string,
continent_id int,
continent string,
age int,
commute_distance int,
credit_balance int,
education string,
email string,
full_time string,
gender string,
household_size int,
income int,
income_level string,
insuff_funds_incidents int,
job_type string,
late_mort_rent_pmts int,
marital_status string,
mortgage_amt int,
num_cars int,
num_mortgages int,
pet string,
promotion_response int,
rent_own string,
seg int,
work_experience int,
yrs_current_employer int,
yrs_customer int,
yrs_residence int,
country_code string,
username string,
customer_address string,
customer_geo_geo string
) row format delimited fields terminated by '\t' stored as textfile;
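
Once data has been placed under the table's location, the table can be queried through HiveServer2. A minimal sketch using PyHive (an assumption here; PyHive is not part of this PR), pointed at the quickstart VM's default HiveServer2 port:

# Sketch: query media_demo_customer via HiveServer2.
# Assumes HiveServer2 is running on quickstart.cloudera:10000 and the pyhive package is installed.
from pyhive import hive

conn = hive.Connection(host="quickstart.cloudera", port=10000,
                       username="cloudera", database="default")
cur = conn.cursor()
cur.execute("SELECT cust_id, last_name, income_level FROM media_demo_customer LIMIT 10")
for row in cur.fetchall():
    print(row)
cur.close()
conn.close()
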
59 changes: 59 additions & 0 deletions bigdatalab/config/hive/twitter_avro_schema.avsc
@@ -0,0 +1,59 @@
{
"type":"record",
"name":"Doc",
"doc":"adoc",
"fields":[
{
"name":"id",
"type":"string"},
{
"name":"user_friends_count",
"type":["int","null"]},
{
"name":"user_location",
"type":["string","null"]},
{
"name":"user_description",
"type":["string","null"]},
{
"name":"user_statuses_count",
"type":["int","null"]},
{
"name":"user_followers_count",
"type":["int","null"]},
{
"name":"user_name",
"type":["string","null"]},
{
"name":"user_screen_name",
"type":["string","null"]},
{
"name":"created_at",
"type":["string","null"]},
{
"name":"text",
"type":["string","null"]},
{
"name":"retweet_count",
"type":["long","null"]},
{
"name":"retweeted",
"type":["boolean","null"]},
{
"name":"in_reply_to_user_id",
"type":["long","null"]},
{
"name":"source",
"type":["string","null"]},
{
"name":"in_reply_to_status_id",
"type":["long","null"]},
{
"name":"media_url_https",
"type":["string","null"]},
{
"name":"expanded_url",
"type":["string","null"]} ]
}
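
To check that records conform to this schema before wiring it into Hive, the schema can be parsed and a sample record round-tripped through an Avro library. A sketch using fastavro (an assumption; any Avro implementation would work), with invented sample values:

# Sketch: parse twitter_avro_schema.avsc and round-trip one sample record through it.
# Assumes the fastavro package is installed; the sample values below are invented for the test.
import io
import json
from fastavro import parse_schema, writer, reader

with open("twitter_avro_schema.avsc") as f:
    parsed = parse_schema(json.load(f))

record = {field["name"]: None for field in parsed["fields"]}   # every field except "id" is nullable
record.update({"id": "1", "user_screen_name": "someone", "text": "hello",
               "retweet_count": 0, "retweeted": False})

buf = io.BytesIO()
writer(buf, parsed, [record])      # raises if the record does not match the schema
buf.seek(0)
print(next(reader(buf)))
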
11 changes: 11 additions & 0 deletions bigdatalab/config/hive/twitter_hive.sql
@@ -0,0 +1,11 @@
CREATE EXTERNAL TABLE default.twitteravro
ROW FORMAT SERDE
'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
TBLPROPERTIES
('avro.schema.url'='/user/cloudera/bigdatalab/config/hive/twitter_avro_schema.avsc');
20 changes: 20 additions & 0 deletions bigdatalab/config/kafka/properties/connect-file-sink.properties
@@ -0,0 +1,20 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name=local-file-sink
connector.class=FileStreamSink
tasks.max=1
file=test.sink.txt
topics=connect-test
20 changes: 20 additions & 0 deletions bigdatalab/config/kafka/properties/connect-file-source.properties
@@ -0,0 +1,20 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name=local-file-source
connector.class=FileStreamSource
tasks.max=1
file=test.txt
topic=connect-test
49 changes: 49 additions & 0 deletions bigdatalab/config/kafka/properties/connect-standalone.properties
@@ -0,0 +1,49 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# These are defaults. This file just demonstrates how to override some settings.
bootstrap.servers=quickstart.cloudera:9092
rest.port=18083

# The converters specify the format of data in Kafka and how to translate it into Connect data. Every Connect user will
# need to configure these based on the format they want their data in when loaded from or stored into Kafka
key.converter=org.apache.kafka.connect.json.JsonConverter
value.converter=org.apache.kafka.connect.json.JsonConverter
# Converter-specific settings can be passed in by prefixing the Converter's setting with the converter we want to apply
# it to
key.converter.schemas.enable=true
value.converter.schemas.enable=true

# The internal converter used for offsets and config data is configurable and must be specified, but most users will
# always want to use the built-in default. Offset and config data is never visible outside of Kafka Connect in this format.
internal.key.converter=org.apache.kafka.connect.json.JsonConverter
internal.value.converter=org.apache.kafka.connect.json.JsonConverter
internal.key.converter.schemas.enable=false
internal.value.converter.schemas.enable=false

offset.storage.file.filename=/tmp/connect.offsets
# Flush much faster than normal, which is useful for testing/debugging
offset.flush.interval.ms=10000

# Set to a list of filesystem paths separated by commas (,) to enable class loading isolation for plugins
# (connectors, converters, transformations). The list should consist of top level directories that include
# any combination of:
# a) directories immediately containing jars with plugins and their dependencies
# b) uber-jars with plugins and their dependencies
# c) directories immediately containing the package directory structure of classes of plugins and their dependencies
# Note: symlinks will be followed to discover dependencies or plugins.
# Examples:
# plugin.path=/usr/local/share/java,/usr/local/share/kafka/plugins,/opt/connectors,
#plugin.path=
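
Taken together, the three Kafka files set up the stock Connect file demo: the file source tails test.txt into the connect-test topic, and the file sink writes that topic back out to test.sink.txt. Because schemas.enable=true with the JsonConverter, each record on the topic is a JSON envelope with "schema" and "payload" fields. A rough consumer sketch using kafka-python (an assumption; it is not part of this PR), against the bootstrap server configured above:

# Sketch: read the connect-test topic populated by the file source connector.
# Assumes the kafka-python package is installed and quickstart.cloudera:9092 is reachable.
import json
from kafka import KafkaConsumer

consumer = KafkaConsumer(
    "connect-test",
    bootstrap_servers="quickstart.cloudera:9092",
    auto_offset_reset="earliest",
    consumer_timeout_ms=10000,          # stop iterating if no new messages arrive for 10s
)
for msg in consumer:
    envelope = json.loads(msg.value)    # JsonConverter with schemas.enable=true wraps each line
    print(envelope["payload"])          # the original line from test.txt
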