-
Notifications
You must be signed in to change notification settings - Fork 8
/
wordcount.hql
30 lines (26 loc) · 1.05 KB
/
wordcount.hql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
/*
* Wordcount in HiveQL
* Tested on Hive version 0.9
* Author: Jason Levitt
* Date: May 15th, 2013
* Note: Probably only works with ASCII English text
*/
-- Delete the tables if they already exist
DROP TABLE myinput;
DROP TABLE wordcount;
CREATE TABLE myinput (line STRING);
-- Load the text from the local (Linux) filesystem. This should be changed to HDFS
-- for any serious usage
LOAD DATA LOCAL INPATH '/user/someperson/mytext.txt' INTO TABLE myinput;
-- Create a table with the words cleaned and counted.
-- The Java regex removes all punctuation and control characters.
CREATE TABLE wordcount AS
SELECT word, count(1) AS count
FROM (SELECT EXPLODE(SPLIT(LCASE(REGEXP_REPLACE(line,'[\\p{Punct},\\p{Cntrl}]','')),' '))
AS word FROM myinput) words
GROUP BY word
-- Sort the output by count with the highest counts first
ORDER BY count DESC, word ASC;
-- Make the output look like the output of the Pig DUMP function
-- so that we can diff this output with the Pig wordcount output
SELECT CONCAT_WS(',', CONCAT("\(",word), CONCAT(count,"\)")) FROM wordcount;