tutorial hadoop pig documentation
It is sometimes difficult for SQL users to learn Pig because their mind is used to working in SQL. In this tutorial, examples of various SQL statements are shown, and then translated into Pig statements. For more detailed documentation, please see the official Pig manual.
Please feel free to add more examples or documentation in the comments section.
SELECT
SELECT * FROM mytable; DUMP mytable;
SELECT col1, col2 FROM mytable; mytable = FOREACH mytable GENERATE col1, col2; DUMP mytable;
SELECT col1 AS new_col1, col2 AS new_col2 FROM mytable; mytable = FOREACH mytable GENERATE col1 AS new_col1, col2 AS new_col2; DUMP mytable;
SELECT col1::integer, col2::varchar FROM mytable; mytable = FOREACH mytable GENERATE (int)col1, (chararray)col2; DUMP mytable;
SELECT * FROM mytable LIMIT 10; mytable = LIMIT mytable 10; DUMP mytable;
SELECT * FROM mytable ORDER BY col1 ASC; mytable = ORDER mytable BY col1 ASC; DUMP mytable;
SELECT * FROM mytable WHERE col1 > 20; mytable = FILTER mytable BY col1 > 20; DUMP mytable;
JOIN
SELECT * FROM mytable INNER JOIN othertable ON mytable.col1 = othertable.col1; mytable = JOIN mytable BY col1, othertable BY col1; DUMP mytable;
SELECT * FROM mytable LEFT OUTER JOIN othertable ON mytable.col1 = othertable.col1; mytable = JOIN mytable BY col1 LEFT OUTER, othertable BY col1; DUMP mytable;
SELECT * FROM mytable RIGHT OUTER JOIN othertable ON mytable.col1 = othertable.col1; mytable = JOIN mytable BY col1 RIGHT OUTER, othertable BY col1; DUMP mytable;
SELECT * FROM mytable FULL OUTER JOIN othertable ON mytable.col1 = othertable.col1; mytable = JOIN mytable BY col1 FULL OUTER, othertable BY col1; DUMP mytable;
SELECT * FROM mytable, othertable; mytable = CROSS mytable, othertable; DUMP mytable;
GROUP BY
SELECT COUNT(*) FROM mytable; mytable = GROUP mytable ALL; mytable = FOREACH mytable GENERATE COUNT(mytable); DUMP mytable;
SELECT COUNT(DISTINCT col1) FROM mytable; mytable = FOREACH mytable GENERATE col1; mytable = DISTINCT mytable; mytable = GROUP mytable ALL; mytable = FOREACH mytable GENERATE COUNT(mytable) AS cnt; DUMP mytable;
TABLES
CREATE TABLE newtable AS SELECT * FROM mytable; STORE mytable INTO '/some_hdfs_folder/newtable' USING PigStorage(',');
DROP TABLE newtable; RMF /some_hdfs_folder/newtable;
LINKS
Pig 0.6 manual
blog comments powered by Disqus
