下面是这两个查询各自的查询计划。正如@thebluephantom 所说,两者的物理计划是相同的,所以不应该有任何性能差异。
create table t1 (a int, b int, c int, d int);
explain select a,b,c,d from t1 group by 1,2,3,4;
== Physical Plan ==
*(2) HashAggregate(keys=[a#14, b#15, c#16, d#17], functions=[])
+- Exchange hashpartitioning(a#14, b#15, c#16, d#17, 200), true, [id=#33]
+- *(1) HashAggregate(keys=[a#14, b#15, c#16, d#17], functions=[])
+- Scan hive default.t1 [a#14, b#15, c#16, d#17], HiveTableRelation `default`.`t1`, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [a#14, b#15, c#16, d#17], Statistics(sizeInBytes=8.0 EiB)
explain select distinct a,b,c,d from t1;
== Physical Plan ==
*(2) HashAggregate(keys=[a#23, b#24, c#25, d#26], functions=[])
+- Exchange hashpartitioning(a#23, b#24, c#25, d#26, 200), true, [id=#58]
+- *(1) HashAggregate(keys=[a#23, b#24, c#25, d#26], functions=[])
+- Scan hive default.t1 [a#23, b#24, c#25, d#26], HiveTableRelation `default`.`t1`, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [a#23, b#24, c#25, d#26], Statistics(sizeInBytes=8.0 EiB)
EXPLAIN EXTENDED 的输出表明,两个查询经过优化之后变成了完全相同的计划:
explain extended select a,b,c,d from t1 group by 1,2,3,4;
== Parsed Logical Plan ==
'Aggregate [1, 2, 3, 4], ['a, 'b, 'c, 'd]
+- 'UnresolvedRelation [t1]
== Analyzed Logical Plan ==
a: int, b: int, c: int, d: int
Aggregate [a#41, b#42, c#43, d#44], [a#41, b#42, c#43, d#44]
+- SubqueryAlias spark_catalog.default.t1
+- HiveTableRelation `default`.`t1`, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [a#41, b#42, c#43, d#44], Statistics(sizeInBytes=8.0 EiB)
== Optimized Logical Plan ==
Aggregate [a#41, b#42, c#43, d#44], [a#41, b#42, c#43, d#44]
+- HiveTableRelation `default`.`t1`, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [a#41, b#42, c#43, d#44], Statistics(sizeInBytes=8.0 EiB)
== Physical Plan ==
*(2) HashAggregate(keys=[a#41, b#42, c#43, d#44], functions=[], output=[a#41, b#42, c#43, d#44])
+- Exchange hashpartitioning(a#41, b#42, c#43, d#44, 200), true, [id=#108]
+- *(1) HashAggregate(keys=[a#41, b#42, c#43, d#44], functions=[], output=[a#41, b#42, c#43, d#44])
+- Scan hive default.t1 [a#41, b#42, c#43, d#44], HiveTableRelation `default`.`t1`, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [a#41, b#42, c#43, d#44], Statistics(sizeInBytes=8.0 EiB)
explain extended select distinct a,b,c,d from t1;
== Parsed Logical Plan ==
'Distinct
+- 'Project ['a, 'b, 'c, 'd]
+- 'UnresolvedRelation [t1]
== Analyzed Logical Plan ==
a: int, b: int, c: int, d: int
Distinct
+- Project [a#50, b#51, c#52, d#53]
+- SubqueryAlias spark_catalog.default.t1
+- HiveTableRelation `default`.`t1`, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [a#50, b#51, c#52, d#53], Statistics(sizeInBytes=8.0 EiB)
== Optimized Logical Plan ==
Aggregate [a#50, b#51, c#52, d#53], [a#50, b#51, c#52, d#53]
+- HiveTableRelation `default`.`t1`, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [a#50, b#51, c#52, d#53], Statistics(sizeInBytes=8.0 EiB)
== Physical Plan ==
*(2) HashAggregate(keys=[a#50, b#51, c#52, d#53], functions=[], output=[a#50, b#51, c#52, d#53])
+- Exchange hashpartitioning(a#50, b#51, c#52, d#53, 200), true, [id=#133]
+- *(1) HashAggregate(keys=[a#50, b#51, c#52, d#53], functions=[], output=[a#50, b#51, c#52, d#53])
+- Scan hive default.t1 [a#50, b#51, c#52, d#53], HiveTableRelation `default`.`t1`, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [a#50, b#51, c#52, d#53], Statistics(sizeInBytes=8.0 EiB)
可以看到,DISTINCT 查询在解析阶段是 Distinct + Project 节点,但在优化后的逻辑计划中被重写成了与 GROUP BY 查询一致的 Aggregate 节点——也就是说,优化器内部会把 DISTINCT 改写为等价的 GROUP BY 形式,两者最终执行的是同一个计划。