标志位对查询性能的影响,示例数据库采用PostgreSQL

drop table if exists  test;
drop function if exists gen_random_zh(int,int);
drop function if exists get_next_index(tweights[]);
drop type if exists  tweights;
/****************************************************************************************
    创建平滑加权轮询系数类型
        weight:设置的系数
        curweight:当前使用的系数,初始化设置为0即可
****************************************************************************************/
create type tweights as(weight integer,curweight integer);
/****************************************************************************************
    平滑加权轮询（smooth weighted round-robin balancing）算法
    示例: array[((50,0)::tweights),((30,0)::tweights),((15,0)::tweights),((5,0)::tweights)]
            配置了4个系数参数,注意所有系数值累加为100,每调用一百次
                第一个系数返回索引1的概率为50%
                第二个系数返回索引2的概率为30%
                第三个系数返回索引3的概率为15%
                第四个系数返回索引4的概率为5%
****************************************************************************************/
create or replace function get_next_index(tweights[])
  returns table(index integer, weights tweights[])
as $$
    declare
        v_i integer;
        v_len integer;
        v_index integer;
        v_total integer;
        v_tmp tweights;
        v_tmpindex tweights;
    begin
        v_len := array_length($1,1); 
        if (1 = v_len) then
          return query select 1,$1;
        end if;
        v_index := -1; v_total := 0;

        for v_i in 1..v_len loop
          v_tmp := $1[v_i];
          v_tmp.curweight := (v_tmp.curweight + v_tmp.weight);
          v_total := (v_total + v_tmp.weight);
          $1[v_i] = v_tmp;
          if (-1 = v_index or ($1[v_index]).curweight < v_tmp.curweight) then
            v_index := v_i;
          end if;
        end loop;

        v_tmpindex := $1[v_index];
        v_tmpindex.curweight :=  v_tmpindex.curweight - v_total;
        $1[v_index] = v_tmpindex;
        return query select v_index,$1;
    end;
$$ language plpgsql strict;
/****************************************************************************************
    检查函数返回概率是否正确
****************************************************************************************/
with recursive paging(id,index,weights) as (
    (select 1,index,weights from get_next_index(array[((30,0)::tweights),((25,0)::tweights),((20,0)::tweights),((15,0)::tweights),((10,0)::tweights)]))  
    union all
    select (p.id+1),a.index,a.weights as tmp from paging as p,get_next_index(p.weights) as a where p.id  < 100000
) select index,count(*) from paging group by index order by index;

/****************************************************************************************
    创建随机生成中文字符函数
drop function if exists gen_random_zh(int,int);
****************************************************************************************/
create or replace function gen_random_zh(int,int)
    returns text
as $$
	select string_agg(chr((random()*(20901-19968)+19968 )::integer) , '')  from generate_series(1,(random()*($2-$1)+$1)::integer);
$$ language sql;

/****************************************************************************************
    创建测试表,使用我们经常使用的表字段定义
    在测试表中测试标志位字段(flag)对索引的影响
drop table if exists  test;
****************************************************************************************/
create table test(
    objectid bigserial not null,                    --唯一编号，主键
    name text not null,                         --名称
    describe text,                              --备注
    flag integer default(0) check(flag>-1 and flag <3) not null,    --标志位
                                                --共设置3个状态,各个状态的占比为 50%,40%,10%
                                                --0:正常(50%),1:已修改(40%),2:已删除(10%)
    generate timestamptz default now() not null,--创建日期
    constraint pk_test_objectid primary key(objectid)
);
create index idx_test_flag on test(flag);
/****************************************************************************************
    导入测试数据,每次导入50W测试数据,执行二次
****************************************************************************************/
do $$
	declare 
		v_flags integer[];
	begin	
		v_flags := array[0,1,2];
		for i in 1..500 loop			
			with recursive paging(id,index,weights) as (
    			(select 1,index,weights from get_next_index(array[((50,0)::tweights),((40,0)::tweights),((10,0)::tweights)]))	 
    			union all
    			select (p.id+1),a.index,a.weights as tmp from paging as p,get_next_index(p.weights) as a where p.id  < 1000
			) insert into test(name,flag)
				select gen_random_zh(8,32),v_flags[index]  from paging;
		end loop;
	end;
$$;
/****************************************************************************************
    为保证执行计划准确,数据导入完成后执行vacuum
****************************************************************************************/
vacuum full freeze VERBOSE analyze test;
/****************************************************************************************
    计算标志位占比
****************************************************************************************/
select flag,count(*) from test group by flag order by flag;
/****************************************************************************************
    执行计划
****************************************************************************************/
--不会使用索引
explain (analyze,verbose,costs,buffers,timing) 
select * from test where flag=0;

--不会使用索引
explain (analyze,verbose,costs,buffers,timing) 
select * from test where flag=1;

--使用了索引
explain (analyze,verbose,costs,buffers,timing) 
select * from test where flag=2;

结论:

1.不要使用标志位,不要使用标志位,不要使用标志位,重要的事情说三遍;
2.如果非要使用请使用条件索引,例如:create index idx_test_flag on test(flag) where flag=2;
3.类似的需求可以建三张一样的表,然后根据不同的动作将数据移动到不同的表里(此时不需要flag字段),具体可参考PostgreSQL 字典表设计;
4.实际应用中”(条件值/总数据)<10%”索引可更高效的发挥作用,例如”where flag=2”,2这个值只点总数据的10%,经验值是<5%最佳.经验值B树索引时,占比大约40%时索引失效(虽然有索引,已经非常慢了);
5.在实际应用中没法控制各标志位的占比,因此在设计表前必须考虑清楚.

数据表标志位对查询性能的影响

结论:

猜你喜欢