Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor order-by fd check #14

Open
wants to merge 13 commits into
base: functional-dependency
Choose a base branch
from
413 changes: 351 additions & 62 deletions planner/core/logical_plan_builder.go

Large diffs are not rendered by default.

70 changes: 48 additions & 22 deletions planner/core/logical_plans.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,6 @@
package core

import (
"math"

"github.com/pingcap/tidb/expression"
"github.com/pingcap/tidb/expression/aggregation"
"github.com/pingcap/tidb/infoschema"
Expand All @@ -34,6 +32,7 @@ import (
"github.com/pingcap/tidb/util/logutil"
"github.com/pingcap/tidb/util/ranger"
"go.uber.org/zap"
"math"
)

var (
Expand Down Expand Up @@ -198,6 +197,11 @@ func (p *LogicalJoin) extractFDForSemiJoin(filtersFromApply []expression.Express
// 1: since semi join will keep part or all of the rows of the outer table, its outer FD can be preserved.
// 2: the un-projected column will be left for the upper layer projection or already be pruned from bottom up.
outerFD, _ := p.children[0].ExtractFD(), p.children[1].ExtractFD()
outerAcrossBlock := p.SelectBlockOffset() != p.children[0].SelectBlockOffset()
if outerAcrossBlock {
outerFD.HasAggBuilt = false
outerFD.GroupByCols.Clear()
}
fds := outerFD

eqCondSlice := expression.ScalarFuncs2Exprs(p.EqualConditions)
Expand All @@ -215,6 +219,11 @@ func (p *LogicalJoin) extractFDForSemiJoin(filtersFromApply []expression.Express

func (p *LogicalJoin) extractFDForInnerJoin(filtersFromApply []expression.Expression) *fd.FDSet {
leftFD, rightFD := p.children[0].ExtractFD(), p.children[1].ExtractFD()
leftAcrossBlock, rightAcrossBlock := p.SelectBlockOffset() != p.children[0].SelectBlockOffset(), p.SelectBlockOffset() != p.children[1].SelectBlockOffset()
if leftAcrossBlock {
leftFD.HasAggBuilt = false
leftFD.GroupByCols.Clear()
}
fds := leftFD
fds.MakeCartesianProduct(rightFD)

Expand Down Expand Up @@ -245,16 +254,19 @@ func (p *LogicalJoin) extractFDForInnerJoin(filtersFromApply []expression.Expres
fds.HashCodeToUniqueID[k] = v
}
}
for i, ok := rightFD.GroupByCols.Next(0); ok; i, ok = rightFD.GroupByCols.Next(i + 1) {
fds.GroupByCols.Insert(i)
if !rightAcrossBlock {
for i, ok := rightFD.GroupByCols.Next(0); ok; i, ok = rightFD.GroupByCols.Next(i + 1) {
fds.GroupByCols.Insert(i)
}
fds.HasAggBuilt = fds.HasAggBuilt || rightFD.HasAggBuilt
}
fds.HasAggBuilt = fds.HasAggBuilt || rightFD.HasAggBuilt
p.fdSet = fds
return fds
}

func (p *LogicalJoin) extractFDForOuterJoin(filtersFromApply []expression.Expression) *fd.FDSet {
outerFD, innerFD := p.children[0].ExtractFD(), p.children[1].ExtractFD()
outerAcrossBlock, innerAcrossBlock := p.SelectBlockOffset() != p.children[0].SelectBlockOffset(), p.SelectBlockOffset() != p.children[1].SelectBlockOffset()
innerCondition := p.RightConditions
outerCondition := p.LeftConditions
outerCols, innerCols := fd.NewFastIntSet(), fd.NewFastIntSet()
Expand All @@ -266,6 +278,7 @@ func (p *LogicalJoin) extractFDForOuterJoin(filtersFromApply []expression.Expres
}
if p.JoinType == RightOuterJoin {
innerFD, outerFD = outerFD, innerFD
outerAcrossBlock, innerAcrossBlock = innerAcrossBlock, outerAcrossBlock
innerCondition = p.LeftConditions
outerCondition = p.RightConditions
innerCols, outerCols = outerCols, innerCols
Expand Down Expand Up @@ -346,9 +359,13 @@ func (p *LogicalJoin) extractFDForOuterJoin(filtersFromApply []expression.Expres
}
}
}

if outerAcrossBlock {
outerFD.HasAggBuilt = false
outerFD.GroupByCols.Clear()
}
fds := outerFD
fds.MakeOuterJoin(innerFD, filterFD, outerCols, innerCols, &opt)

fds.MakeOuterJoin(innerFD, filterFD, outerCols, innerCols, &opt, innerAcrossBlock)
p.fdSet = fds
return fds
}
Expand Down Expand Up @@ -578,6 +595,13 @@ func (p *LogicalProjection) ExtractFD() *fd.FDSet {
fds.MakeNotNull(notnullColsUniqueIDs)
// select max(a) from t group by b, we should project both `a` & `b` to maintain the FD down here, even if select-fields only contain `a`.
fds.ProjectCols(outputColsUniqueIDs.Union(fds.GroupByCols))
if fds.HasAggBuilt && fds.GroupByCols.Only1Zero() && p.baseLogicalPlan.FDChecked {
// maxOneRow is delayed from agg's ExtractFD logic since some details listed in it.
fds.MaxOneRow(outputColsUniqueIDs)
// for select * from view (including agg), the outer projection doesn't have to check the select list against the inner group-by flag.
fds.HasAggBuilt = false
fds.GroupByCols.Clear()
}
// just trace it down in every operator for test checking.
p.fdSet = fds
return fds
Expand Down Expand Up @@ -966,7 +990,7 @@ func (p *LogicalSelection) ExtractFD() *fd.FDSet {
// join's schema will miss t2.a while join.full schema has. since selection
// itself doesn't contain schema, extracting schema should tell them apart.
var columns []*expression.Column
if join, ok := p.children[0].(*LogicalJoin); ok {
if join, ok := p.children[0].(*LogicalJoin); ok && join.fullSchema != nil {
columns = join.fullSchema.Columns
} else {
columns = p.Schema().Columns
Expand All @@ -984,19 +1008,6 @@ func (p *LogicalSelection) ExtractFD() *fd.FDSet {
// extract equivalence cols.
equivUniqueIDs := extractEquivalenceCols(p.Conditions, p.SCtx(), fds)

// after left join, according to rule 3.3.3, it may create a lax FD from inner equivalence
// cols pointing to outer equivalence cols. eg: t left join t1 on t.a = t1.b, leading a
// lax FD from t1.b ~> t.a, this lax attribute is coming from supplied null value to all
// left rows, once there is a null-refusing predicate on the inner side on upper layer, this
// can be equivalence again. (the outer rows left are all coming from equal matching)
//
// why not just makeNotNull of them, because even a non-equiv-related inner col can also
// refuse supplied null values.
if fds.Rule333Equiv.InnerCols.Len() != 0 && notnullColsUniqueIDs.Intersects(fds.Rule333Equiv.InnerCols) {
// restore/re-strength FDs from rule 333
fds.MakeRestoreRule333()
}

// apply operator's characteristic's FD setting.
fds.MakeNotNull(notnullColsUniqueIDs)
fds.AddConstants(constUniqueIDs)
Expand Down Expand Up @@ -1059,11 +1070,26 @@ func (la *LogicalApply) ExtractFD() *fd.FDSet {
}
}
}
// select (select t1.a from t1 where t1.rid = t2.id), count(t2.b) from t2 group by (t2.id)
// for correlated scalar sub-query, the whole sub-query will be projected as a new column for example here.
// while for every same t2.id, this sub-query's scalar output must be the same, actually it's a kind of strict FD here.
applyStrictDetermine := fd.NewFastIntSet()
applyStrictDependency := fd.NewFastIntSet()
if innerPlan.Schema().Len() == 1 && len(deduplicateCorrelatedCols) > 0 {
// single column in apply join inner side will be output directly.
for _, cc := range deduplicateCorrelatedCols {
applyStrictDetermine.Insert(int(cc.UniqueID))
}
applyStrictDependency.Insert(int(innerPlan.Schema().Columns[0].UniqueID))
}

switch la.JoinType {
case InnerJoin:
return la.extractFDForInnerJoin(eqCond)
case LeftOuterJoin, RightOuterJoin:
return la.extractFDForOuterJoin(eqCond)
fds := la.extractFDForOuterJoin(eqCond)
fds.AddStrictFunctionalDependency(applyStrictDetermine, applyStrictDependency)
return fds
case SemiJoin:
return la.extractFDForSemiJoin(eqCond)
default:
Expand Down
10 changes: 8 additions & 2 deletions planner/core/plan.go
Original file line number Diff line number Diff line change
Expand Up @@ -377,7 +377,8 @@ type baseLogicalPlan struct {
// including eliminating unnecessary DISTINCT operators, simplifying ORDER BY columns,
// removing Max1Row operators, and mapping semi-joins to inner-joins.
// for now, it's hard to maintain in individual operator, build it from bottom up when using.
fdSet *fd.FDSet
fdSet *fd.FDSet
FDChecked bool
}

// ExtractFD returns the children[0]'s fdSet if no FD is added or removed in this logical plan.
Expand All @@ -386,8 +387,13 @@ func (p *baseLogicalPlan) ExtractFD() *fd.FDSet {
return p.fdSet
}
fds := &fd.FDSet{HashCodeToUniqueID: make(map[string]int)}
// isolation between different logical query blocks.
acrossBlock := false
winoros marked this conversation as resolved.
Show resolved Hide resolved
for _, ch := range p.children {
fds.AddFrom(ch.ExtractFD())
if p.SelectBlockOffset() != ch.SelectBlockOffset() {
acrossBlock = true
}
fds.AddFrom(ch.ExtractFD(), acrossBlock)
}
return fds
}
Expand Down
151 changes: 151 additions & 0 deletions planner/funcdep/doc.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,154 @@ package funcdep
// https://cs.uwaterloo.ca/research/tr/2000/11/CS-2000-11.thesis.pdf

// TODO: Add the RFC design.

// NOTE 1.
// When handling a lax FD, we don't care about null values in the dependency, which means
// that having the determinant columns covered by not-null attributes is enough to upgrade a lax FD to a strict one.

// The definition of "lax" used in the paper differs from the definition used by this
// library. For a lax dependency A~~>B, the paper allows this set of rows:
//
// a b
// -------
// 1 1
// 1 NULL
//
// This alternate definition is briefly covered in section 2.5.3.2 of the paper (see definition
// 2.19). The reason for this change is to allow a lax dependency to be upgraded to a strict
// dependency more readily, needing only the determinant columns to be not-null rather than
// both determinant and dependant columns.
//
// This is on the condition that, for definite values of determinant of a Lax FD, it won't
// have two same definite dependant values. That's true, because there is no way to derive
// this kind of FD.
//
// Even in our implementation of outer join, the only way to produce duplicate definite
// determinant is the join predicate. But for now, we only maintain the equivalence and
// some strict FD of it.
//
// t(a,b) left join t1(c,d,e) on t.a = t1.c and b=1
// a b | c d e
// ------+----------------
// 1 1 | 1 NULL 1
// 1 2 | NULL NULL NULL
// 2 1 | NULL NULL NULL
//
// Actually it's possible: the lax FD {a} -> {c} can be derived, but it is not that useful. We only
// maintain {c} ~> {a} after the outer join. Besides, there are two Cond-FDs that should
// be preserved, waiting to become visible again once a null-rejecting predicate is applied to
// the null-constraint columns. (see below)
//
// NOTE 2.
// When handling an outer join, it won't produce a lax FD with duplicate definite determinant values and
// different dependency values.
//
// In the implementation, we come across some lax FDs that depend on the null-rejection of some other cols. For
// example:
// t(a,b) left join t1(c,d,e) on t.a = t1.c and b=1
// a b | c d e
// ------+----------------
// 1 1 | 1 NULL 1
// 1 2 | NULL NULL NULL
// 2 1 | NULL NULL NULL
//
// Here the constant FD {} -> {b} won't exist after the outer join is done. Notice the null-constraint
// {c,d,e} -| {c,d,e}: this FD should be preserved and will become visible again when some null-rejecting
// predicate takes effect on the null-constraint cols.
//
// It's the same for the strict equivalence {t.a} = {t1.c}. Notice there is no lax equivalence here, because
// the left side can't be guaranteed to be definite or null, like a=2 here. Let's collect all of these
// on-condition FDs, each associated with a null-constraint column set, and name them Cond-FDs.
//
// Lax equivalences are theoretically possible, but one won't be constructed from an outer join unless
// t already has a constant FD on column `a` before the outer join runs. So the lax equivalence
// has some pre-conditions, as you can see, and it can't cover the case shown above. Let's handle it the way a
// Cond-FD does.
//
// The FDs constructed from the join predicate should be considered Cond-FDs, such as the equivalence
// {a} == {c} and the constant FD {b} = 1 here (if the join condition is e=1, it belongs here too). We can say that for
// every matched row these FDs are valid, while for the other rows the inner side is supplied with null
// rows. So these FDs are stored as ncEdges whose nc condition is all of the inner table's cols.
//
// We introduced the invisible FD with null-constraint columns, named Cond-FD, to solve the problem above.
// For multiple nested left joins, we take the following case as an example.
// a,b c,d,e
// -----------+-----------
// 1 2 | 1 1 1
// 2 2 |
// -----------+-----------
//
// left join on (a=c) res:
// a b c d e
// -------------------------
// 1 2 1 1 1
// 2 2 +- null null null -+
// | |
// +-------------------+
// \
// \
// the Cond-FD are < a=c with {c,d,e} > the latter is as null constraint cols
//
// e,f
// -----------------------
// 1 2
// 2 2
// 3 3
// -----------------------
//
// left join on (e=a) res:
// e f a b c d e
// -----------------------------------
// 1 2 1 2 1 1 1
// 2 2 2 2 +- null null null --+---------------> Cond-FD are <a=c with {c,d,e}> still exists.
// 3 3 +-null null | null null null |---+
// | +-------------------+ |
// +-----------------------------------+-----------> New Cond-FD are <e=a with {a,b,c,d,e}> occurs.
//
//
// The old Cond-FD with null-constraint column set {c,d,e} is preserved because the newly appended cols are all null too.
// The new Cond-FD with null-constraint column set {a,b,c,d,e} is also meaningful: even if the null-rejected column
// is one of {c,d,e}, which may remove one of the matched rows from the result, the equivalence {a}={e} still exists.
//
// Suppose that the result of the first left join is like:
// left join on (a=c) res:
// a b c d e
// ---------------------------
// 1 2 1 1 1
// null 2 null null null
//
// THEN: left join on (e=a) res:
// e f a b c d e
// ---------------------------------
// 1 2 1 2 1 1 1
// 2 2 null null null null null
// 3 3 null null null null null
Copy link
Collaborator Author

@AilinKid AilinKid Apr 20, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

L134 should be
3 3 null null null null null

//
// Even so, both the old Cond-FD and the new Cond-FD still exist. It seems that the null-constraint column set of the
// old Cond-FD {c,d,e} could be expanded to {a,b,c,d,e} visually, but we can't derive that from the join predicate
// (e=a). A null-reject on column `a` can't theoretically bring visibility to the old Cond-FD; it just happens
// to filter out that row with a null value in column a.
//
// Think about adding one more row in first left join result.
//
// left join on (a=c) res:
// a b c d e
// ---------------------------
// 1 2 1 1 1
// null 2 null null null
// 3 3 null null null
//
// THEN: left join on (e=a) res:
// e f a b c d e
// ---------------------------------
// 1 2 1 2 1 1 1
// 2 2 null null null null null
// 3 3 3 3 null null null
//
// Conclusion:
// As you can see, we indeed can't use the join predicate (e=a) to expand the old Cond-FD's nc
// {c,d,e} to {a,b,c,d,e}. So the rule for Cond-FD is quite simple: just keep the old ncEdges from the right side and append
// the new ncEdges of the current left join.
//
// If the first left join result is on the outer side of the second left join, just keep the ncEdges from the left side as well
// and append the new ncEdges of the current left join.
3 changes: 0 additions & 3 deletions planner/funcdep/extract_fd_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -349,9 +349,6 @@ func TestFDSet_MakeOuterJoin(t *testing.T) {
ctx := context.TODO()
is := testGetIS(ass, tk.Session())
for i, tt := range tests {
if i == 0 {
fmt.Println(1)
}
comment := fmt.Sprintf("case:%v sql:%s", i, tt.sql)
stmt, err := par.ParseOneStmt(tt.sql, "", "")
ass.Nil(err, comment)
Expand Down
Loading