- Environment setup (run once per VM)
sudo apt-get install python3-gdbm
curl https://bootstrap.pypa.io/get-pip.py | python3.9
pip install urllib3==1.26.6
/usr/local/hbase/bin/hbase thrift start
PORTS:
- 50070: HDFS (NameNode web UI)
- 8088: Apache Hadoop (YARN ResourceManager UI)
- 9443: NiFi
- 18080: Apache Spark (History Server)
- 8081: Apache Flink
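To quickly see which of these services are actually up, a minimal Python probe (run it on the VM itself, or locally after forwarding the ports with ssh -L as shown below; the service names are just the labels above):

import socket

# Try a 1-second TCP connect to each UI port and report the result.
SERVICES = {50070: "HDFS NameNode", 8088: "YARN RM", 9443: "NiFi",
            18080: "Spark History", 8081: "Flink"}

for port, name in SERVICES.items():
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.settimeout(1)
        state = "open" if s.connect_ex(("localhost", port)) == 0 else "closed"
        print(f"{name:15} {port:6} {state}")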
- Open an SSH tunnel
ssh -L 2222:vl26.mini.pw.edu.pl:22 [email protected]
- Example port forwarding:
ssh -L 8088:vl26.mini.pw.edu.pl:8088 [email protected]
- Connect to the VM
ssh -i Desktop\big_data\private_key vagrant@localhost -p 2222
- Run all services
sudo ./scripts/bootstrap.sh
- list files in HDFS
hdfs dfs -ls /
- check free disk space
hdfs dfs -df -h
- check the size of each directory
hdfs dfs -du -h /
- make directory
hdfs dfs -mkdir dir_path
- put file to HDFS
hdfs dfs -put file_path hdfs_path
- copy from local to HDFS
hdfs dfs -copyFromLocal file_path hdfs_path
- get file from HDFS
hdfs dfs -get hdfs_path file_path
- copy to local from HDFS
hdfs dfs -copyToLocal hdfs_path file_path
- cat file
hdfs dfs -cat hdfs_path
- mv (rename) file
hdfs dfs -mv src_hdfs_path dst_hdfs_path
- cp file
hdfs dfs -cp src_hdfs_path dst_hdfs_path
- rm file
hdfs dfs -rm hdfs_path
- rm -r directory
hdfs dfs -rm -r hdfs_path
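The same commands are easy to script; a minimal Python sketch, assuming the hdfs binary is on PATH and a local file example.txt exists (a hypothetical name, as is the /user/vagrant/demo path):

import subprocess

# Thin wrapper around the `hdfs dfs` CLI commands listed above.
def hdfs(*args):
    subprocess.run(["hdfs", "dfs", *args], check=True)

hdfs("-mkdir", "-p", "/user/vagrant/demo")          # make directory
hdfs("-put", "example.txt", "/user/vagrant/demo/")  # put file to HDFS
hdfs("-ls", "/user/vagrant/demo")                   # list files
hdfs("-cat", "/user/vagrant/demo/example.txt")      # cat file
hdfs("-rm", "-r", "/user/vagrant/demo")             # clean up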
- Make directory (WebHDFS REST API)
curl -i -X PUT "vl26.mini.pw.edu.pl:50070/webhdfs/v1/user/galowskim/test6?user.name=hdfs&op=MKDIRS"
- Create file (step 1: the namenode answers with a 307 redirect to a datanode)
curl -i -X PUT "vl26.mini.pw.edu.pl:50070/webhdfs/v1/user/galowskim/tescik.txt?user.name=testuser&op=CREATE"
- Append to file (op=APPEND uses POST, sent to the datanode)
curl -i -X POST -T tesciiik.txt "node1:50075/webhdfs/v1/user/galowskim/tesciiik.txt?op=APPEND&user.name=testuser&namenoderpcaddress=node1:8020"
- Create file with input from local file
curl -i -X PUT -T Desktop\zamek_kaniowski.txt "vl26.mini.pw.edu.pl:50075/webhdfs/v1/user/galowskim/tesciiik.txt?op=CREATE&user.name=testuser&namenoderpcaddress=node1:8020&overwrite=false"
- Opening a file via the namenode (DOES NOT WORK)
curl -i -L "vl26.mini.pw.edu.pl:50070/webhdfs/v1/user/galowskim/tesciiik.txt?user.name=hdfs&op=OPEN"
- Opening that works (hitting the datanode directly)
curl -i -L "http://vl26.mini.pw.edu.pl:50075/webhdfs/v1/user/galowskim/tesciiik.txt?op=OPEN&user.name=testuser&namenoderpcaddress=node1:8020&offset=5"
- Delete a directory
curl -i -X DELETE "vl26.mini.pw.edu.pl:50070/webhdfs/v1/user/galowskim?user.name=hdfs&op=DELETE"
- WebHDFS from Python (pywebhdfs package); see the sketch below
from pywebhdfs.webhdfs import PyWebHdfsClient
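A minimal pywebhdfs sketch mirroring the curl calls above; host, user, and paths come from those examples, while the client API itself is an assumption based on the package's documented interface (pywebhdfs takes paths without the leading slash):

from pywebhdfs.webhdfs import PyWebHdfsClient

# Client pointed at the namenode's WebHDFS port (50070 above);
# the library follows the namenode -> datanode redirects itself.
client = PyWebHdfsClient(host='vl26.mini.pw.edu.pl', port='50070',
                         user_name='testuser')

client.make_dir('user/galowskim/test6')                      # op=MKDIRS
client.create_file('user/galowskim/tesciiik.txt', b'hello')  # op=CREATE
client.append_file('user/galowskim/tesciiik.txt', b' world') # op=APPEND
print(client.read_file('user/galowskim/tesciiik.txt'))       # op=OPEN
client.delete_file_dir('user/galowskim/test6')               # op=DELETE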
- connect to a Hive server
hive -h <host_name> -p <port>
To quit:
quit;
hive -e <query in quotes> or hive -f <file_name>
hive -e "select * from employees limit 10"
- show databases
show databases;
- list tables
show tables in default;
- sample content
select * from employees limit 10;
- evaluate an expression (with a column alias)
select 2+3 as calculation;
- parametrized script
select * from employees limit ${hivevar:ROW_LIMIT};
- execute it
beeline -u jdbc:hive2://localhost:10000/ --hivevar ROW_LIMIT=10 -f test.hql
- variables can also be set inside the script
set TEST_VAR='test';
SET hivevar:ROW_LIMIT=2;
SET;
SELECT * FROM employees LIMIT ${hivevar:ROW_LIMIT};
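Running the parametrized script from Python is just a subprocess call; a sketch assuming beeline is on PATH and HiveServer2 listens on localhost:10000 as above:

import subprocess

# Same invocation as the beeline command above, with the hivevar substituted.
subprocess.run([
    "beeline",
    "-u", "jdbc:hive2://localhost:10000/",
    "--hivevar", "ROW_LIMIT=10",
    "-f", "test.hql",
], check=True)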
- create table
CREATE TABLE IF NOT EXISTS wifi (
id INT,
name STRING,
x_wgs84 STRING,
y_wgs84 STRING
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n';
LOAD DATA LOCAL INPATH '/home/vagrant/Desktop/big_data/data/wifi.csv' OVERWRITE INTO TABLE wifi;
- data insertion
FROM wifi
INSERT OVERWRITE TABLE wifi1
SELECT * WHERE name LIKE 'awil-%'
INSERT OVERWRITE TABLE wifi2
SELECT * WHERE id > 21 AND id <= 32;
- as parquet
CREATE TABLE wifi_par STORED AS PARQUET as SELECT * FROM wifi;
- as avro
CREATE TABLE wifi_avro STORED AS AVRO as SELECT * FROM wifi;
- external tables
CREATE EXTERNAL TABLE external_table_trams (brigade INT, firstLine INT,
time TIMESTAMP, status STRING, lon DOUBLE, lat DOUBLE, line INT,
lowfloor BOOLEAN, finaltime TIMESTAMP)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\073'
LINES TERMINATED BY '\n'
LOCATION '/user/<user_name>/external_table_trams';
LOAD DATA INPATH '/user/<user_name>/trams.csv' INTO TABLE external_table_trams;
- modifying the timestamp format
ALTER TABLE external_table_trams SET SERDEPROPERTIES
("timestamp.formats"="yyyy-MM-dd'T'HH:mm:ss,yyyy-MM-dd'T'HH:mm:ss.SSS");
IF SOMETHING DOES NOT WORK, TRY:
SET hive.exec.dynamic.partition=true;
SET hive.exec.dynamic.partition.mode=nonstrict;
SET hive.enforce.bucketing=true;
- dynamic partitioning
CREATE EXTERNAL TABLE external_table_trams_part (brigade INT, firstLine INT,
time TIMESTAMP, status STRING, lon DOUBLE, lat DOUBLE, line INT,
lowfloor BOOLEAN, finaltime TIMESTAMP)
PARTITIONED BY (day STRING)
CLUSTERED BY (line) SORTED BY (line ASC) INTO 5 BUCKETS
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\073'
LINES TERMINATED BY '\n'
LOCATION '/user/<user_name>/external_table_trams_part';
- insert data
INSERT INTO external_table_trams_part PARTITION(day)
SELECT brigade, firstLine, time, status, lon, lat, line, lowfloor, finaltime, CURRENT_DATE AS day FROM external_table_trams;
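To verify the insert, the partitioned table can be queried from Python; a minimal sketch assuming the PyHive package is installed and HiveServer2 listens on localhost:10000 (both assumptions, not set up by these notes):

from pyhive import hive  # assumed extra dependency: pip install pyhive

conn = hive.Connection(host="localhost", port=10000)
cur = conn.cursor()

# List the partitions created by the dynamic-partition insert above.
cur.execute("SHOW PARTITIONS external_table_trams_part")
print(cur.fetchall())

# Count rows in today's partition (the WHERE clause allows partition pruning).
cur.execute("SELECT count(*) FROM external_table_trams_part "
            "WHERE day = cast(CURRENT_DATE AS STRING)")
print(cur.fetchall())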