- Environment setup (run once per VM)
sudo apt-get install python3-gdbm
curl https://bootstrap.pypa.io/get-pip.py | python3.9
pip install urllib3==1.26.6
/usr/local/hbase/bin/hbase thrift start
PORTS:
- 50070: HDFS (NameNode web UI)
- 8088: Apache Hadoop (YARN ResourceManager UI)
- 9443: NiFi
- 18080: Apache Spark (History Server)
- 8081: Apache Flink
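To quickly see which of these services are actually up, a minimal Python probe (run it on the VM itself, or locally after forwarding the ports with ssh -L as shown below; the service names are just the labels above):

import socket

# Try a 1-second TCP connect to each UI port and report the result.
SERVICES = {50070: "HDFS NameNode", 8088: "YARN RM", 9443: "NiFi",
            18080: "Spark History", 8081: "Flink"}

for port, name in SERVICES.items():
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.settimeout(1)
        state = "open" if s.connect_ex(("localhost", port)) == 0 else "closed"
        print(f"{name:15} {port:6} {state}")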
- Open an SSH tunnel
ssh -L 2222:vl26.mini.pw.edu.pl:22 [email protected]
- Example port forwarding:
ssh -L 8088:vl26.mini.pw.edu.pl:8088 [email protected]
- Connect to the VM
ssh -i Desktop\big_data\private_key vagrant@localhost -p 2222
- Run all services
sudo ./scripts/bootstrap.sh
- list files in HDFS
hdfs dfs -ls /
- check free disk space
hdfs dfs -df -h
- check the size of each directory
hdfs dfs -du -h /
- make directory
hdfs dfs -mkdir dir_path
- put file to HDFS
hdfs dfs -put file_path hdfs_path
- copy from local to HDFS
hdfs dfs -copyFromLocal file_path hdfs_path
- get file from HDFS
hdfs dfs -get hdfs_path file_path
- copy to local from HDFS
hdfs dfs -copyToLocal hdfs_path file_path
- cat file
hdfs dfs -cat hdfs_path
- mv (rename) file
hdfs dfs -mv src_hdfs_path dst_hdfs_path
- cp file
hdfs dfs -cp src_hdfs_path dst_hdfs_path
- rm file
hdfs dfs -rm hdfs_path
- rm -r directory
hdfs dfs -rm -r hdfs_path
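The same commands are easy to script; a minimal Python sketch, assuming the hdfs binary is on PATH and a local file example.txt exists (a hypothetical name, as is the /user/vagrant/demo path):

import subprocess

# Thin wrapper around the `hdfs dfs` CLI commands listed above.
def hdfs(*args):
    subprocess.run(["hdfs", "dfs", *args], check=True)

hdfs("-mkdir", "-p", "/user/vagrant/demo")          # make directory
hdfs("-put", "example.txt", "/user/vagrant/demo/")  # put file to HDFS
hdfs("-ls", "/user/vagrant/demo")                   # list files
hdfs("-cat", "/user/vagrant/demo/example.txt")      # cat file
hdfs("-rm", "-r", "/user/vagrant/demo")             # clean up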
- Make directory (WebHDFS REST API)
curl -i -X PUT "vl26.mini.pw.edu.pl:50070/webhdfs/v1/user/galowskim/test6?user.name=hdfs&op=MKDIRS"
- Create file (step 1: the namenode answers with a 307 redirect to a datanode)
curl -i -X PUT "vl26.mini.pw.edu.pl:50070/webhdfs/v1/user/galowskim/tescik.txt?user.name=testuser&op=CREATE"
- Append to file (op=APPEND uses POST, sent to the datanode)
curl -i -X POST -T tesciiik.txt "node1:50075/webhdfs/v1/user/galowskim/tesciiik.txt?op=APPEND&user.name=testuser&namenoderpcaddress=node1:8020"
- Create file with input from local file
curl -i -X PUT -T Desktop\zamek_kaniowski.txt "vl26.mini.pw.edu.pl:50075/webhdfs/v1/user/galowskim/tesciiik.txt?op=CREATE&user.name=testuser&namenoderpcaddress=node1:8020&overwrite=false"
- Opening a file via the namenode (DOES NOT WORK)
curl -i -L "vl26.mini.pw.edu.pl:50070/webhdfs/v1/user/galowskim/tesciiik.txt?user.name=hdfs&op=OPEN"
- Opening that works (hitting the datanode directly)
curl -i -L "http://vl26.mini.pw.edu.pl:50075/webhdfs/v1/user/galowskim/tesciiik.txt?op=OPEN&user.name=testuser&namenoderpcaddress=node1:8020&offset=5"
- Delete a directory
curl -i -X DELETE "vl26.mini.pw.edu.pl:50070/webhdfs/v1/user/galowskim?user.name=hdfs&op=DELETE"
- WebHDFS from Python (pywebhdfs package); see the sketch below
from pywebhdfs.webhdfs import PyWebHdfsClient
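A minimal pywebhdfs sketch mirroring the curl calls above; host, user, and paths come from those examples, while the client API itself is an assumption based on the package's documented interface (pywebhdfs takes paths without the leading slash):

from pywebhdfs.webhdfs import PyWebHdfsClient

# Client pointed at the namenode's WebHDFS port (50070 above);
# the library follows the namenode -> datanode redirects itself.
client = PyWebHdfsClient(host='vl26.mini.pw.edu.pl', port='50070',
                         user_name='testuser')

client.make_dir('user/galowskim/test6')                      # op=MKDIRS
client.create_file('user/galowskim/tesciiik.txt', b'hello')  # op=CREATE
client.append_file('user/galowskim/tesciiik.txt', b' world') # op=APPEND
print(client.read_file('user/galowskim/tesciiik.txt'))       # op=OPEN
client.delete_file_dir('user/galowskim/test6')               # op=DELETE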
- connect to a Hive server
hive -h <host_name> -p <port>
To quit:
quit;
hive -e <query in quotes> or hive -f <file_name>
hive -e "select * from employees limit 10"
- show databases
show databases;
- list tables
show tables in default;
- sample content
select * from employees limit 10;
- evaluate an expression (with a column alias)
select 2+3 as calculation;
- parametrized script
select * from employees limit ${hivevar:ROW_LIMIT};
- execute it
beeline -u jdbc:hive2://localhost:10000/ --hivevar ROW_LIMIT=10 -f test.hql
- variables can also be set inside the script
set TEST_VAR='test';
SET hivevar:ROW_LIMIT=2;
SET;
SELECT * FROM employees LIMIT ${hivevar:ROW_LIMIT};
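Running the parametrized script from Python is just a subprocess call; a sketch assuming beeline is on PATH and HiveServer2 listens on localhost:10000 as above:

import subprocess

# Same invocation as the beeline command above, with the hivevar substituted.
subprocess.run([
    "beeline",
    "-u", "jdbc:hive2://localhost:10000/",
    "--hivevar", "ROW_LIMIT=10",
    "-f", "test.hql",
], check=True)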
- create table
CREATE TABLE IF NOT EXISTS wifi (
id INT,
name STRING,
x_wgs84 STRING,
y_wgs84 STRING
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n';
LOAD DATA LOCAL INPATH '/home/vagrant/Desktop/big_data/data/wifi.csv' OVERWRITE INTO TABLE wifi;
- data insertion
FROM wifi
INSERT OVERWRITE TABLE wifi1
SELECT * WHERE name LIKE 'awil-%'
INSERT OVERWRITE TABLE wifi2
SELECT * WHERE id > 21 AND id <= 32;
- as parquet
CREATE TABLE wifi_par STORED AS PARQUET as SELECT * FROM wifi;
- as avro
CREATE TABLE wifi_avro STORED AS AVRO as SELECT * FROM wifi;
- external tables
CREATE EXTERNAL TABLE external_table_trams (brigade INT, firstLine INT,
time TIMESTAMP, status STRING, lon DOUBLE, lat DOUBLE, line INT,
lowfloor BOOLEAN, finaltime TIMESTAMP)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\073'
LINES TERMINATED BY '\n'
LOCATION '/user/<user_name>/external_table_trams';
LOAD DATA INPATH '/user/<user_name>/trams.csv' INTO TABLE external_table_trams;
- modifying the timestamp format
ALTER TABLE external_table_trams SET SERDEPROPERTIES
("timestamp.formats"="yyyy-MM-dd'T'HH:mm:ss,yyyy-MM-dd'T'HH:mm:ss.SSS");
IF SOMETHING DOES NOT WORK, TRY:
SET hive.exec.dynamic.partition=true;
SET hive.exec.dynamic.partition.mode=nonstrict;
SET hive.enforce.bucketing=true;
- dynamic partitioning
CREATE EXTERNAL TABLE external_table_trams_part (brigade INT, firstLine INT,
time TIMESTAMP, status STRING, lon DOUBLE, lat DOUBLE, line INT,
lowfloor BOOLEAN, finaltime TIMESTAMP)
PARTITIONED BY (day STRING)
CLUSTERED BY (line) SORTED BY (line ASC) INTO 5 BUCKETS
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\073'
LINES TERMINATED BY '\n'
LOCATION '/user/<user_name>/external_table_trams_part';
- insert data
INSERT INTO external_table_trams_part PARTITION(day)
SELECT brigade, firstLine, time, status, lon, lat, line, lowfloor, finaltime, CURRENT_DATE AS day FROM external_table_trams;
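To verify the insert, the partitioned table can be queried from Python; a minimal sketch assuming the PyHive package is installed and HiveServer2 listens on localhost:10000 (both assumptions, not set up by these notes):

from pyhive import hive  # assumed extra dependency: pip install pyhive

conn = hive.Connection(host="localhost", port=10000)
cur = conn.cursor()

# List the partitions created by the dynamic-partition insert above.
cur.execute("SHOW PARTITIONS external_table_trams_part")
print(cur.fetchall())

# Count rows in today's partition (the WHERE clause allows partition pruning).
cur.execute("SELECT count(*) FROM external_table_trams_part "
            "WHERE day = cast(CURRENT_DATE AS STRING)")
print(cur.fetchall())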