Add files via upload #30

Open · wants to merge 1 commit into master
23 changes: 23 additions & 0 deletions bigdatalab/config/flume/conf/net.conf
@@ -0,0 +1,23 @@
# Naming the components on the current agent

NetcatAgent.sources = Netcat
NetcatAgent.channels = MemChannel
NetcatAgent.sinks = LoggerSink

# Describing/Configuring the source

NetcatAgent.sources.Netcat.type = netcat
NetcatAgent.sources.Netcat.bind = quickstart.cloudera
NetcatAgent.sources.Netcat.port = 56565
NetcatAgent.sources.Netcat.channels = MemChannel

# Describing/Configuring the sink

NetcatAgent.sinks.LoggerSink.type = logger
NetcatAgent.sinks.LoggerSink.channel = MemChannel

# Describing/Configuring the channel

NetcatAgent.channels.MemChannel.type = memory
NetcatAgent.channels.MemChannel.capacity = 1000
NetcatAgent.channels.MemChannel.transactionCapacity = 100
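
A quick way to exercise this netcat source, once the agent has been started with net.conf, is to push a newline-terminated test event to the configured bind address and port. A minimal Python sketch, assuming quickstart.cloudera:56565 is reachable from where it runs:

# Sketch: send one test event to the netcat source defined above.
# Assumes the Flume agent is running and quickstart.cloudera:56565 is reachable.
import socket

with socket.create_connection(("quickstart.cloudera", 56565), timeout=10) as sock:
    sock.sendall(b"hello flume\n")                      # each newline-terminated line becomes one event
    print(sock.recv(16).decode(errors="replace"))       # the netcat source acks each event with "OK" by default

The logger sink should then show the event in the agent's log output.
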
21 changes: 21 additions & 0 deletions bigdatalab/config/flume/conf/nethd.conf
@@ -0,0 +1,21 @@
NetcatAgent.sources = Netcat
NetcatAgent.channels = MemChannel
NetcatAgent.sinks = hdfssink

NetcatAgent.sources.Netcat.type = netcat
NetcatAgent.sources.Netcat.bind = quickstart.cloudera
NetcatAgent.sources.Netcat.port = 56563
NetcatAgent.sources.Netcat.channels = MemChannel

NetcatAgent.channels.MemChannel.type = memory
NetcatAgent.channels.MemChannel.capacity = 1000

# Describing/Configuring the HDFS sink and binding it to MemChannel

NetcatAgent.sinks.hdfssink.type = hdfs
NetcatAgent.sinks.hdfssink.channel = MemChannel
NetcatAgent.sinks.hdfssink.hdfs.path = /user/cloudera/flumedata/
NetcatAgent.sinks.hdfssink.hdfs.fileType = DataStream
NetcatAgent.sinks.hdfssink.hdfs.writeFormat = Text
NetcatAgent.sinks.hdfssink.hdfs.filePrefix=
NetcatAgent.sinks.hdfssink.hdfs.fileSuffix=.txt
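
The same kind of smoke test applies here, except the events end up in HDFS instead of the agent log. A rough sketch, assuming the agent is listening on quickstart.cloudera:56563 and the hdfs CLI is available on the machine running it:

# Sketch: push a few events to the netcat source from nethd.conf, then list the HDFS output directory.
# Assumes the Flume agent is running on quickstart.cloudera:56563 and the hdfs command is on the PATH.
import socket
import subprocess

with socket.create_connection(("quickstart.cloudera", 56563), timeout=10) as sock:
    for i in range(3):
        sock.sendall(f"event {i}\n".encode())

# Rolled files should appear under the sink's hdfs.path with the configured .txt suffix.
subprocess.run(["hdfs", "dfs", "-ls", "/user/cloudera/flumedata/"], check=False)
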
28 changes: 28 additions & 0 deletions bigdatalab/config/flume/conf/twitter.conf
@@ -0,0 +1,28 @@
TwitterAgent.sources = Twitter
TwitterAgent.channels = MemChannel
TwitterAgent.sinks = HDFS

# Describing/Configuring the source
TwitterAgent.sources.Twitter.type = org.apache.flume.source.twitter.TwitterSource
TwitterAgent.sources.Twitter.consumerKey = NbIJpPFfD1Utc3rniAZb6LEV0
TwitterAgent.sources.Twitter.consumerSecret = a68tIE0Br5vGjZ1WSqisSCiHj6oBHy2VmusECNfMt9YjoQWQHc
TwitterAgent.sources.Twitter.accessToken = 1052482036764618757-2bac0mOpZz1MBWISzn4Bskftxf3Are
TwitterAgent.sources.Twitter.accessTokenSecret = 35yKKMHGmvbx0wpnl4fuTrx2eOvPNiuiFfx7Pf3xtO6Ny

TwitterAgent.sinks.HDFS.type = hdfs
TwitterAgent.sinks.HDFS.hdfs.path = /user/cloudera/twitter_data/
TwitterAgent.sinks.HDFS.hdfs.fileType = DataStream
TwitterAgent.sinks.HDFS.hdfs.writeFormat = Text
TwitterAgent.sinks.HDFS.hdfs.batchSize = 5
TwitterAgent.sinks.HDFS.hdfs.rollSize = 0
TwitterAgent.sinks.HDFS.hdfs.rollCount = 10

# Describing/Configuring the channel

TwitterAgent.channels.MemChannel.type = memory
TwitterAgent.channels.MemChannel.capacity = 10000
TwitterAgent.channels.MemChannel.transactionCapacity = 100

# Binding the source and sink to the channel
TwitterAgent.sources.Twitter.channels = MemChannel
TwitterAgent.sinks.HDFS.channel = MemChannel
4 binary files not shown.
44 changes: 44 additions & 0 deletions bigdatalab/config/hive/media_customer.ddl
@@ -0,0 +1,44 @@
CREATE EXTERNAL TABLE media_demo_customer(
cust_id int,
last_name string,
first_name string,
street_address string,
postal_code string,
city_id int,
city string,
state_province_id int,
state_province string,
country_id int,
country string,
continent_id int,
continent string,
age int,
commute_distance int,
credit_balance int,
education string,
email string,
full_time string,
gender string,
household_size int,
income int,
income_level string,
insuff_funds_incidents int,
job_type string,
late_mort_rent_pmts int,
marital_status string,
mortgage_amt int,
num_cars int,
num_mortgages int,
pet string,
promotion_response int,
rent_own string,
seg int,
work_experience int,
yrs_current_employer int,
yrs_customer int,
yrs_residence int,
country_code string,
username string,
customer_address string,
customer_geo_geo string
) row format delimited fields terminated by '\t' stored as textfile;
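
Once data has been placed under the table's location, the table can be queried through HiveServer2. A minimal sketch using PyHive (an assumption here; PyHive is not part of this PR), pointed at the quickstart VM's default HiveServer2 port:

# Sketch: query media_demo_customer via HiveServer2.
# Assumes HiveServer2 is running on quickstart.cloudera:10000 and the pyhive package is installed.
from pyhive import hive

conn = hive.Connection(host="quickstart.cloudera", port=10000,
                       username="cloudera", database="default")
cur = conn.cursor()
cur.execute("SELECT cust_id, last_name, income_level FROM media_demo_customer LIMIT 10")
for row in cur.fetchall():
    print(row)
cur.close()
conn.close()
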
59 changes: 59 additions & 0 deletions bigdatalab/config/hive/twitter_avro_schema.avsc
@@ -0,0 +1,59 @@
{
"type":"record",
"name":"Doc",
"doc":"adoc",
"fields":[
{
"name":"id",
"type":"string"},
{
"name":"user_friends_count",
"type":["int","null"]},
{
"name":"user_location",
"type":["string","null"]},
{
"name":"user_description",
"type":["string","null"]},
{
"name":"user_statuses_count",
"type":["int","null"]},
{
"name":"user_followers_count",
"type":["int","null"]},
{
"name":"user_name",
"type":["string","null"]},
{
"name":"user_screen_name",
"type":["string","null"]},
{
"name":"created_at",
"type":["string","null"]},
{
"name":"text",
"type":["string","null"]},
{
"name":"retweet_count",
"type":["long","null"]},
{
"name":"retweeted",
"type":["boolean","null"]},
{
"name":"in_reply_to_user_id",
"type":["long","null"]},
{
"name":"source",
"type":["string","null"]},
{
"name":"in_reply_to_status_id",
"type":["long","null"]},
{
"name":"media_url_https",
"type":["string","null"]},
{
"name":"expanded_url",
"type":["string","null"]} ]
}
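
To check that records conform to this schema before wiring it into Hive, the schema can be parsed and a sample record round-tripped through an Avro library. A sketch using fastavro (an assumption; any Avro implementation would work), with invented sample values:

# Sketch: parse twitter_avro_schema.avsc and round-trip one sample record through it.
# Assumes the fastavro package is installed; the sample values below are invented for the test.
import io
import json
from fastavro import parse_schema, writer, reader

with open("twitter_avro_schema.avsc") as f:
    parsed = parse_schema(json.load(f))

record = {field["name"]: None for field in parsed["fields"]}   # every field except "id" is nullable
record.update({"id": "1", "user_screen_name": "someone", "text": "hello",
               "retweet_count": 0, "retweeted": False})

buf = io.BytesIO()
writer(buf, parsed, [record])      # raises if the record does not match the schema
buf.seek(0)
print(next(reader(buf)))
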
11 changes: 11 additions & 0 deletions bigdatalab/config/hive/twitter_hive.sql
@@ -0,0 +1,11 @@
CREATE EXTERNAL TABLE default.twitteravro
ROW FORMAT SERDE
'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
TBLPROPERTIES
('avro.schema.url'='/user/cloudera/bigdatalab/config/hive/twitter_avro_schema.avsc');
20 changes: 20 additions & 0 deletions bigdatalab/config/kafka/properties/connect-file-sink.properties
@@ -0,0 +1,20 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name=local-file-sink
connector.class=FileStreamSink
tasks.max=1
file=test.sink.txt
topics=connect-test
20 changes: 20 additions & 0 deletions bigdatalab/config/kafka/properties/connect-file-source.properties
@@ -0,0 +1,20 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name=local-file-source
connector.class=FileStreamSource
tasks.max=1
file=test.txt
topic=connect-test
49 changes: 49 additions & 0 deletions bigdatalab/config/kafka/properties/connect-standalone.properties
@@ -0,0 +1,49 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# These are defaults. This file just demonstrates how to override some settings.
bootstrap.servers=quickstart.cloudera:9092
rest.port=18083

# The converters specify the format of data in Kafka and how to translate it into Connect data. Every Connect user will
# need to configure these based on the format they want their data in when loaded from or stored into Kafka
key.converter=org.apache.kafka.connect.json.JsonConverter
value.converter=org.apache.kafka.connect.json.JsonConverter
# Converter-specific settings can be passed in by prefixing the Converter's setting with the converter we want to apply
# it to
key.converter.schemas.enable=true
value.converter.schemas.enable=true

# The internal converter used for offsets and config data is configurable and must be specified, but most users will
# always want to use the built-in default. Offset and config data is never visible outside of Kafka Connect in this format.
internal.key.converter=org.apache.kafka.connect.json.JsonConverter
internal.value.converter=org.apache.kafka.connect.json.JsonConverter
internal.key.converter.schemas.enable=false
internal.value.converter.schemas.enable=false

offset.storage.file.filename=/tmp/connect.offsets
# Flush much faster than normal, which is useful for testing/debugging
offset.flush.interval.ms=10000

# Set to a list of filesystem paths separated by commas (,) to enable class loading isolation for plugins
# (connectors, converters, transformations). The list should consist of top level directories that include
# any combination of:
# a) directories immediately containing jars with plugins and their dependencies
# b) uber-jars with plugins and their dependencies
# c) directories immediately containing the package directory structure of classes of plugins and their dependencies
# Note: symlinks will be followed to discover dependencies or plugins.
# Examples:
# plugin.path=/usr/local/share/java,/usr/local/share/kafka/plugins,/opt/connectors,
#plugin.path=
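
Taken together, the three Kafka files set up the stock Connect file demo: the file source tails test.txt into the connect-test topic, and the file sink writes that topic back out to test.sink.txt. Because schemas.enable=true with the JsonConverter, each record on the topic is a JSON envelope with "schema" and "payload" fields. A rough consumer sketch using kafka-python (an assumption; it is not part of this PR), against the bootstrap server configured above:

# Sketch: read the connect-test topic populated by the file source connector.
# Assumes the kafka-python package is installed and quickstart.cloudera:9092 is reachable.
import json
from kafka import KafkaConsumer

consumer = KafkaConsumer(
    "connect-test",
    bootstrap_servers="quickstart.cloudera:9092",
    auto_offset_reset="earliest",
    consumer_timeout_ms=10000,          # stop iterating if no new messages arrive for 10s
)
for msg in consumer:
    envelope = json.loads(msg.value)    # JsonConverter with schemas.enable=true wraps each line
    print(envelope["payload"])          # the original line from test.txt
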