分组依据子句

描述

GROUP BY 子句用于根据一组指定的分组表达式对行进行分组,并基于一个或多个指定的聚合函数对该分组的行计算聚合。Spark 还支持高级聚合,通过 GROUPING SETS CUBE ROLLUP 子句对同一输入记录集进行多个聚合。分组表达式和高级聚合可以在 GROUP BY 子句中混合使用,并可以嵌套在 GROUPING SETS 子句中。有关更多细节,请参见 Mixed/Nested Grouping Analytics 部分。当一个 FILTER 子句附加到聚合函数时,仅匹配的行会传递给该函数。

语法

GROUP BY group_expression [ , group_expression [ , ... ] ] [ WITH { ROLLUP | CUBE } ]
GROUP BY { group_expression | { ROLLUP | CUBE | GROUPING SETS } (grouping_set [ , ...]) } [ , ... ]

聚合函数被定义为

aggregate_name ( [ DISTINCT ] expression [ , ... ] ) [ FILTER ( WHERE boolean_expression ) ]

参数

示例

CREATE TABLE dealer (id INT, city STRING, car_model STRING, quantity INT);
INSERT INTO dealer VALUES
(100, 'Fremont', 'Honda Civic', 10),
(100, 'Fremont', 'Honda Accord', 15),
(100, , 'Honda CRV', 7),
(200, , , 20),
(200, , , 10),
(200, , , 3),
(300, , , 5),
(300, , , 8);
-- 每个经销商的数量总和。按 `id` 分组。
SELECT id, sum(quantity) FROM dealer GROUP BY id ORDER BY id;
+---+-------------+
| id|sum(quantity)|
+---+-------------+
|100| 32|
|200| 33|
|300| 13|
+---+-------------+
-- 在 GROUP by 子句中使用列位置。
SELECT id, sum(quantity) FROM dealer GROUP BY 1 ORDER BY 1;
+---+-------------+
| id|sum(quantity)|
+---+-------------+
|100| 32|
|200| 33|
|300| 13|
+---+-------------+
-- 多重聚合。
-- 1. 每个经销商的数量总和。
-- 2. 每个经销商的最大数量。
SELECT id, sum(quantity) AS sum, max(quantity) AS max FROM dealer GROUP BY id ORDER BY id;
+---+---+---+
| id|sum|max|
+---+---+---+
|100| 32| 15|
|200| 33| 20|
|300| 13| 8|
+---+---+---+
-- 计算每个汽车型号的不同经销商城市数量。
SELECT car_model, count(DISTINCT city) AS count FROM dealer GROUP BY car_model;
+------------+-----+
| car_model|count|
+------------+-----+
| Honda Civic| 3|
| Honda CRV| 2|
|Honda Accord| 3|
+------------+-----+
-- 仅对 'Honda Civic' 和 'Honda CRV' 的每个经销商的数量求和。
SELECT id, sum(quantity) FILTER (
WHERE car_model IN ('Honda Civic', )
) AS `sum(quantity)` FROM dealer
GROUP BY id ORDER BY id;
+---+-------------+
| id|sum(quantity)|
+---+-------------+
|100| 17|
|200| 23|
|300| 5|
+---+-------------+
-- 在单个语句中使用多个分组列进行聚合。
-- 以下根据四组分组列执行聚合。
-- 1. city, car_model
-- 2. city
-- 3. car_model
-- 4. 空分组集。返回所有城市和汽车型号的数量。
SELECT city, car_model, sum(quantity) AS sum FROM dealer
GROUP BY GROUPING SETS ((city, car_model), (city), (car_model), ())
ORDER BY city;
+---------+------------+---+
| city| car_model|sum|
+---------+------------+---+
| null| null| 78|
| null| HondaAccord| 33|
| null| HondaCRV| 10|
| null| HondaCivic| 35|
| Dublin| null| 33|
| Dublin| HondaAccord| 10|
| Dublin| HondaCRV| 3|
| Dublin| HondaCivic| 20|
| Fremont| null| 32|
| Fremont| HondaAccord| 15|
| Fremont| HondaCRV| 7|
| Fremont| HondaCivic| 10|
| San Jose| null| 13|
| San Jose| HondaAccord| 8|
| San Jose| HondaCivic| 5|
+---------+------------+---+
-- 使用 `ROLLUP` 子句进行分组处理。
-- 等价于 GROUP BY GROUPING SETS ((city, car_model), (city), ())
SELECT city, car_model, sum(quantity) AS sum FROM dealer
GROUP BY city, car_model WITH ROLLUP
ORDER BY city, car_model;
+---------+------------+---+
| city| car_model|sum|
+---------+------------+---+
| null| null| 78|
| Dublin| null| 33|
| Dublin| HondaAccord| 10|
| Dublin| HondaCRV| 3|
| Dublin| HondaCivic| 20|
| Fremont| null| 32|
| Fremont| HondaAccord| 15|
| Fremont| HondaCRV| 7|
| Fremont| HondaCivic| 10|
| San Jose| null| 13|
| San Jose| HondaAccord| 8|
| San Jose| HondaCivic| 5|
+---------+------------+---+
-- 使用 `CUBE` 子句进行分组处理。
-- 等价于 GROUP BY GROUPING SETS ((city, car_model), (city), (car_model), ())
SELECT city, car_model, sum(quantity) AS sum FROM dealer
GROUP BY city, car_model WITH CUBE
ORDER BY city, car_model;
+---------+------------+---+
| city| car_model|sum|
+---------+------------+---+
| null| null| 78|
| null| HondaAccord| 33|
| null| HondaCRV| 10|
| null| HondaCivic| 35|
| Dublin| null| 33|
| Dublin| HondaAccord| 10|
| Dublin| HondaCRV| 3|
| Dublin| HondaCivic| 20|
| Fremont| null| 32|
| Fremont| HondaAccord| 15|
| Fremont| HondaCRV| 7|
| Fremont| HondaCivic| 10|
| San Jose| null| 13|
| San Jose| HondaAccord| 8|
| San Jose| HondaCivic| 5|
+---------+------------+---+
-- 准备数据以进行忽略空值的示例
CREATE TABLE person (id INT, name STRING, age INT);
INSERT INTO person VALUES
(100, , NULL),
(200, , 30),
(300, , 80),
(400, , 50);
-- 选择列 age 中的第一行
SELECT FIRST(age) FROM person;
+--------------------+
| first(age, false) |
+--------------------+
| NULL |
+--------------------+
-- 获取列 `age` 中的第一行,忽略空值,列 `id` 中的最后一行,以及列 `id` 的总和。
SELECT FIRST(age IGNORE NULLS), LAST(id), SUM(id) FROM person;
+-------------------+------------------+----------+
| first(age, true) | last(id, false) | sum(id) |
+-------------------+------------------+----------+
| 30 | 400 | 1000 |
+-------------------+------------------+----------+