概述
基本语法
[WITH name AS (select_expression) [, ...] ]
SELECT
[ALL | DISTINCT]
[STRAIGHT_JOIN]
expression [, expression ...]
FROM table_reference [, table_reference ...]
[[FULL | [LEFT | RIGHT] INNER | [LEFT | RIGHT] OUTER | [LEFT | RIGHT] SEMI | [LEFT | RIGHT] ANTI | CROSS]
JOIN table_reference
[ON join_equality_clauses | USING (col1[, col2 ...]] ...
WHERE conditions
GROUP BY { column | expression [, ...] }
HAVING conditions
ORDER BY { column | expression [ASC | DESC] [NULLS FIRST | NULLS LAST] [, ...] }
LIMIT expression [OFFSET expression]
[UNION [ALL] select_statement] ...]
table_reference := { table_name | (subquery) }
[ TABLESAMPLE SYSTEM(percentage) [REPEATABLE(seed)] ]
joins
使用join关键字
可以使用ON
关键字做链接也可以使用USING
关键字连接
SELECT t1.c1, t2.c2 FROM t1 JOIN t2
ON t1.id = t2.id and t1.type_flag = t2.type_flag
WHERE t1.c1 > 100;
SELECT t1.c1, t2.c2 FROM t1 JOIN t2
USING (id, type_flag)
WHERE t1.c1 > 100;
使用","分隔形式
该方法可读性不是很高,且当改动SQL条件时容易将连接条件误删
SELECT t1.c1, t2.c2 FROM t1, t2
WHERE
t1.id = t2.id AND t1.type_flag = t2.type_flag
AND t1.c1 > 100;
自连接
SELECT lhs.id, rhs.parent, lhs.c1, rhs.c2 FROM tree_data lhs, tree_data rhs WHERE lhs.id = rhs.parent;
外连接和内连接
内连接的INNER
可以省略,使用内连接时,必须两表同时存在数据,任意一方没有数据,都会被过滤掉。
SELECT t1.id, c1, c2 FROM t1, t2 WHERE t1.id = t2.id;
SELECT t1.id, c1, c2 FROM t1 JOIN t2 ON t1.id = t2.id;
SELECT t1.id, c1, c2 FROM t1 INNER JOIN t2 ON t1.id = t2.id;
连接有LEFT
,RIGHT
和FULL
,如果没有这些关键词,默认为LEFT
连接,即如果右侧的表没有对应数据则为null。
SELECT * FROM t1 LEFT OUTER JOIN t2 ON t1.id = t2.id;
SELECT * FROM t1 RIGHT OUTER JOIN t2 ON t1.id = t2.id;
SELECT * FROM t1 FULL OUTER JOIN t2 ON t1.id = t2.id;
非等值连接
前面所有的ON
,USING
,WHERE
在左右表进行相等比较,这种的被成为等值连接,impala现在还支持非等值连接,但是这类查询要小心使用,避免产生巨大结果集。
SELECT * FROM t1 CROSS JOIN t2 WHERE t1.total > t2.maximum_price;
Semi-joins
SEMI JOIN
这种变体很少使用,左半连接对于右表中匹配到的行,只返回左表中的数据,无论右表中匹配到了多少行,只返回左表中的一行。右半则刚好相反。
SELECT t1.c1, t1.c2, t1.c2 FROM t1 LEFT SEMI JOIN t2 ON t1.id = t2.id;
Anti-joins
ANTI JOIN
这种连接方式,返回的是没有匹配到的数据,比如LEFT ANTI JOIN
返回的就是左表中在右表没有匹配到的数据。
order by
该语句会将数据在每个节点排好序,然后将数据发送到coordinator
节点做最终的排序,如果你想要的是前N项,可以直接添加limit
关键字,这也会减少排序使用的资源量。
语法
ORDER BY col_ref [, col_ref ...] [ASC | DESC] [NULLS FIRST | NULLS LAST]
col_ref ::= column_name | integer_literal
可以使用order by 1
来按照第一列进行排序,使用数字的方式指定相应的列,但是不能使用非数字的列排序,比如string
,这可能会导致查询结果是成功的,但是结果不是你想要的;另外select *
也不能使用该方式表达。
当查询中没有limit
关键字时,impala查询出一个巨大的结果集,从而内存放不下时,会使用磁盘的空间来进行排序。
复杂类型的排序
如果直接使用复杂类型,会导致查询失败。
CREATE TABLE games (id BIGINT, score ARRAY <BIGINT>) STORED AS PARQUET;
...use LOAD DATA to load externally created Parquet files into the table...
SELECT id FROM games ORDER BY score DESC;
ERROR: AnalysisException: ORDER BY expression 'score' with complex type 'ARRAY<BIGINT>' is not supported.
必须要将数据join到一起,才能使用排序。
SELECT id, item FROM games, games.score
WHERE item > 1000000
ORDER BY id, item desc;
该item字段就是数组中的每个数据。更多复杂类型使用,见下面的例子。
CREATE TABLE games2 (id BIGINT, play array < struct <game_name: string, score: BIGINT, high_score: boolean> >) STORED AS PARQUET
...use LOAD DATA to load externally created Parquet files into the table...
SELECT id, item.game_name, item.score FROM games2, games2.play
WHERE item.score > 1000000
ORDER BY id, item.score DESC;
CREATE TABLE games3 (id BIGINT, play ARRAY < MAP <STRING, BIGINT> >) STORED AS PARQUET;
...use LOAD DATA to load externally created Parquet files into the table...
SELECT id, info.key AS k, info.value AS v from games3, games3.play AS plays, games3.play.item AS info
WHERE info.KEY = 'score' AND info.VALUE > 1000000
ORDER BY id, info.value desc;
impala的排序、聚合等操作如果数据量过大,都会将数据写到磁盘中,默认情况下,会写到/tmp/impala-scratch
目录。也可以启动impalad进程时使用‑‑scratch_dirs="path_to_directory"
参数进行配置。
可以使用NULLS FIRST
或者NULLS LAST
来控制空值在排序中的位置。
[localhost:21000] > create table numbers (x int);
[localhost:21000] > insert into numbers values (1), (null), (2), (null), (3);
[localhost:21000] > select x from numbers order by x nulls first;
+------+
| x
|
+------+
| NULL |
| NULL |
| 1
|
| 2
|
| 3
|
+------+
[localhost:21000] > select x from numbers order by x desc nulls first;
+------+
| x
|
+------+
| NULL |
| NULL |
| 3
|
| 2
|
| 1
|
+------+
[localhost:21000] > select x from numbers order by x nulls last;
+------+
| x
|
+------+
| 1
|
| 2
|
| 3
|
| NULL |
| NULL |
+------+
[localhost:21000] > select x from numbers order by x desc nulls last;
+------+
| x
|
+------+
| 3
|
| 2
|
| 1
|
| NULL |
| NULL |
+------+
group by
group by 可以配合聚合函数使用,还可以使用having关键字过滤聚合函数的值。
查询销售量最低的5个产品。
select
ss_item_sk as Item,
count(ss_item_sk) as Times_Purchased,
sum(ss_quantity) as Total_Quantity_Purchased
from store_sales
group by ss_item_sk
having count(ss_item_sk)>= 100
order by sum(ss_quantity)
limit 5;
+-------+-----------------+--------------------------+
| item
| times_purchased | total_quantity_purchased |
+-------+-----------------+--------------------------+
| 13943 | 105
| 4087
|
| 2992
| 101
| 4176
|
| 4773
| 107
| 4204
|
| 14350 | 103
| 4260
|
| 11956 | 102
| 4275
|
+-------+-----------------+--------------------------+
limit
SELECT x FROM t1 LIMIT 1e6;
-- 前100万
SELECT x FROM t1 LIMIT length('hello world');
-- 前11
SELECT x FROM t1 LIMIT 2+2;
-- 前4
SELECT x FROM t1 LIMIT cast(truncate(9.9) AS INT); -- 前9
搭配offset
关键词,达到分页效果。
[localhost:21000] > select x from numbers order by x limit 5 offset 0;
+----+
| x
|
+----+
| 1
|
| 2
|
| 3
|
| 4
|
| 5
|
+----+
[localhost:21000] > select x from numbers order by x limit 5 offset 5;
+----+
| x
|
+----+
| 6
|
| 7
|
| 8
|
| 9
|
| 10 |
+----+
union
语法
query_1 UNION [DISTINCT | ALL] query_2
union all
会返回两个表拼接出来的所有数据,union
会对数据进行去重的操作,所以当能够确定两个表没有重复数据时,建议使用union all
,union
会使用更多的资源。
例如
[localhost:21000] > select x from few_ints order by x;
+---+
| x |
+---+
| 1 |
| 1 |
| 2 |
| 3 |
+---+
Returned 4 row(s) in 0.41s
[localhost:21000] > select x from few_ints union all select x from few_ints;
+---+
| x |
+---+
| 1 |
| 1 |
| 2 |
| 3 |
| 1 |
| 1 |
| 2 |
| 3 |
+---+
Returned 8 row(s) in 0.42s
[localhost:21000] > select * from (select x from few_ints union all select x from few_ints) as t1 order by x;
+---+
| x |
+---+
| 1 |
| 1 |
| 1 |
| 1 |
| 2 |
| 2 |
| 3 |
| 3 |
+---+
Returned 8 row(s) in 0.53s
[localhost:21000] > select x from few_ints union all select 10;
+----+
| x
|
+----+
| 10 |
| 1
|
| 1
|
| 2
|
| 3
|
+----+
Returned 5 row(s) in 0.38s
[localhost:21000] > select x from few_ints union select x+1 from few_ints;
+---+
| x |
+---+
| 3 |
| 4 |
| 1 |
| 2 |
+---+
Returned 4 row(s) in 0.51s
[localhost:21000] > select x from few_ints union select 10;
+----+
| x
|
+----+
| 2
|
| 10 |
| 1
|
| 3
|
+----+
Returned 4 row(s) in 0.49s
[localhost:21000] > select * from (select x from few_ints union select x from few_ints) as t1 order by x;
+---+
| x |
+---+
| 1 |
| 2 |
| 3 |
+---+
Returned 3 row(s) in 0.53s
subqueries
子查询让SQL语句能力变得非常强大,可以使用很多非常复杂的过滤。
-- 直接判断条件是否正确
SELECT x FROM t1 WHERE x > (SELECT MAX(y) FROM t2);
-- 使用EXISTS关键字,判断是否成立
SELECT count(x) FROM t1 WHERE EXISTS(SELECT 1 FROM t2 WHERE t1.x = t2.y * 10);
-- 用在from后和join后
SELECT avg(t1.x), max(t2.y) FROM
(SELECT id, cast(a AS DECIMAL(10,5)) AS x FROM raw_data WHERE a BETWEEN 0 AND 100) AS t1
JOIN
(SELECT id, length(s) AS y FROM raw_data WHERE s LIKE 'A%') AS t2;
USING (id);
tablesample
语法
TABLESAMPLE SYSTEM(percentage) [REPEATABLE(seed)]
该关键词作用是在库中对数据进行抽样,可以指定一个数字,最终表示抽样大概的百分比,例如percentage指定为50则会在数据中抽出50%的数据。
with
with是用来简化语句查询的,类似临时的一个视图,使你在实际语句中表达的更简洁。
with t1 as (select 1), t2 as (select 2) insert into tab select * from t1 union all select * from t2;
with t1 as (select 1) (with t2 as (select 2) select * from t2) union all select * from t1;
DISTINCT
distinct是指返回不同的数据,将结果集进行去重。
-- 对单个字段进行去重
SELECT DISTINCT c_birth_country FROM customer;
-- 对多个字段进行去重
SELECT DISTINCT c_salutation, c_last_name FROM customer;
-- 获取不重复的出生地的数量
SELECT COUNT(DISTINCT c_birth_country) FROM customer;
-- 获取多个字段不重复的数量
SELECT COUNT(DISTINCT c_salutation, c_last_name) FROM customer;
最后
以上就是心灵美老鼠为你收集整理的impala系列:服务API--select的全部内容,希望文章能够帮你解决impala系列:服务API--select所遇到的程序开发问题。
如果觉得靠谱客网站的内容还不错,欢迎将靠谱客网站推荐给程序员好友。
发表评论 取消回复