当前位置: 首页 > news >正文

Postgresql 根据单列或几列分组去重row_number() over() partition by

Postgresql 根据单列或几列分组去重row_number() over() partition by

一般用于单列或者几列需要去重后进行计算值的

count(distinct(eid)) 可以

比如有个例子,需要根据名称,城市去筛选覆盖的道路长度,以月因为建立了唯一索引是ok的,年时可能会有重复的,如何去重呢?用窗口函数:row_number() over() partition by
count(distinct(length)) 不行,因为很多道路数据本就有相同的长度

1. 效果图

可以看到 distinctCnt > Cnt说明有重复,点开string_agg的结果发现确实是有重复;, 这样计算其所对应的length值肯定偏大。
在这里插入图片描述
去重后效果图如下: 把所有的聚合条件都写在partition by后边。
可以看到后边的里程和也正常了不少。
试验发现pname有无差别不大,可能是因为构造的数据集小;,但其实是需要的;
在这里插入图片描述
以第一条数据去验证:

2. 源码

2.1 建表,构建数据

drop table if exists t_pa_cover;
create table if not exists t_pa_cover(pname text COLLATE pg_catalog."default" NOT NULL,upload_date varchar(12),city_code varchar(20) default '',link_pid varchar(20),link_length numeric default 0,create_time timestamp with time zone NOT NULL DEFAULT now(),constraint t_pa_cover_unique_key unique (pname,upload_date,city_code,link_pid)
);
COMMENT ON TABLE t_pa_cover IS '覆盖率中间表';
COMMENT ON COLUMN t_pa_cover.pname IS '名称';
COMMENT ON COLUMN t_pa_cover.upload_date IS '日期';
COMMENT ON COLUMN t_pa_cover.city_code IS '城市行政编码';
COMMENT ON COLUMN t_pa_cover.link_pid IS 'linkpid';
COMMENT ON COLUMN t_pa_cover.link_length IS 'linkpid长度m';
COMMENT ON COLUMN t_pa_cover.create_time IS '创建时间';create index if not exists t_pa_cover_citycode on t_pa_cover(city_code);
create index if not exists t_pa_cover_pname on t_pa_cover(pname);
create index if not exists t_pa_cover_uploaddate on t_pa_cover(upload_date);INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('aa', '202201', '1101', 4721472607, 99.88);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('aa', '202201', '1201', 4731620766, 64.96);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('aa', '202201', '1301', 4725763511, 82.77);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('aa', '202202', '1101', 4732413545, 23.63);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('aa', '202202', '1201', 4733766774, 17.97);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('aa', '202202', '1301', 4725763511, 82.77);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('aa', '202203', '1101', 4732413545, 23.63);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('aa', '202203', '1201', 4721472607, 99.88);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('aa', '202203', '1301', 4733766774, 17.97);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('aa', '202204', '1101', 4721472607, 99.88);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('aa', '202204', '1201', 4738504835, 37.94);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('aa', '202204', '1301', 4727435973, 39.05);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('aa', '202205', '1101', 4737641033, 1.41);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('aa', '202205', '1201', 4725763511, 82.77);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('aa', '202205', '1301', 4727435973, 39.05);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('aa', '202206', '1101', 4725763511, 82.77);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('aa', '202206', '1201', 4737641033, 1.41);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('aa', '202206', '1301', 4733766774, 17.97);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('aa', '202207', '1101', 4725763511, 82.77);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('aa', '202207', '1201', 4740662897, 86.96);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('aa', '202207', '1301', 4719251580, 43.12);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('aa', '202208', '1101', 4719251580, 43.12);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('aa', '202208', '1201', 4727435973, 39.05);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('aa', '202208', '1301', 4725763511, 82.77);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('aa', '202209', '1101', 4741477663, 35.39);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('aa', '202209', '1201', 4738504835, 37.94);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('aa', '202209', '1301', 4740789027, 5.36);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('aa', '202210', '1101', 4721472607, 99.88);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('aa', '202210', '1201', 4733766774, 17.97);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('aa', '202210', '1301', 4732413545, 23.63);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('aa', '202211', '1101', 4719251580, 43.12);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('aa', '202211', '1201', 4740789027, 5.36);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('aa', '202211', '1301', 4719251580, 43.12);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('aa', '202212', '1101', 4740789027, 5.36);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('aa', '202212', '1201', 4740662897, 86.96);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('aa', '202212', '1301', 4721472607, 99.88);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('bb', '202201', '1101', 4738492963, 10.75);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('bb', '202201', '1201', 4736532327, 44.78);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('bb', '202201', '1301', 4740856924, 39.60);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('bb', '202202', '1101', 4739710021, 85.77);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('bb', '202202', '1201', 4736532327, 44.78);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('bb', '202202', '1301', 4712358476, 44.06);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('bb', '202203', '1101', 4734479408, 25.51);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('bb', '202203', '1201', 4738273045, 99.60);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('bb', '202203', '1301', 4740856924, 39.60);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('bb', '202204', '1101', 4735500946, 49.98);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('bb', '202204', '1201', 4738273045, 99.60);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('bb', '202204', '1301', 4736169127, 58.38);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('bb', '202205', '1101', 4736797286, 26.90);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('bb', '202205', '1201', 4716723755, 89.29);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('bb', '202205', '1301', 4740856924, 39.60);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('bb', '202206', '1101', 4738492963, 10.75);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('bb', '202206', '1201', 4735500946, 49.98);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('bb', '202206', '1301', 4712358476, 44.06);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('bb', '202207', '1101', 4716723755, 89.29);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('bb', '202207', '1201', 4740108020, 77.72);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('bb', '202207', '1301', 4730167080, 0.11);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('bb', '202208', '1101', 4716723755, 89.29);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('bb', '202208', '1201', 4738492963, 10.75);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('bb', '202208', '1301', 4730167080, 0.11);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('bb', '202209', '1101', 4716723755, 89.29);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('bb', '202209', '1201', 4735500946, 49.98);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('bb', '202209', '1301', 4712358476, 44.06);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('bb', '202210', '1101', 4736532327, 44.78);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('bb', '202210', '1201', 4738273045, 99.60);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('bb', '202210', '1301', 4716723755, 89.29);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('bb', '202211', '1101', 4740108020, 77.72);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('bb', '202211', '1201', 4740108020, 77.72);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('bb', '202211', '1301', 4741340832, 83.51);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('bb', '202212', '1101', 4738492963, 10.75);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('bb', '202212', '1201', 4734479408, 25.51);
INSERT INTO public.t_pa_cover(pname, upload_date, city_code, link_pid, link_length) VALUES ('bb', '202212', '1301', 4741340832, 83.51);

2.2 去重与没去重——sql对比

-- 有重复
select pname,substring(upload_date,0,5) as upDate,city_code as cityCode,count(distinct(link_pid)) distinctCnt,count(link_pid) cnt,string_agg(link_pid,','),sum(link_length)
from t_pa_cover
group by pname,upDate,cityCode-- 去重后
select pname,substring(upload_date,0,5) as upDate,city_code as cityCode,count(distinct(link_pid)) distinctCnt,count(link_pid) cnt,string_agg(link_pid,','),sum(link_length)
from (select row_number() over(partition by pname,substring(upload_date,0,5),city_code,link_pid) as rn,a.*from t_pa_cover awhere substring(upload_date,0,5) ='2022'
) b 
where b.rn=1
group by pname,upDate,cityCode;

参考

  • Postgresql语句持续更新
  • Postgresql大全
  • https://blog.csdn.net/wbj3106/article/details/82109077
http://www.lryc.cn/news/3518.html

相关文章:

  • 基于蒙特卡洛法的规模化电动车有序充放电及负荷预测(PythonMatlab实现)
  • Selenium常用API详解,从入门到进阶(全套)
  • 自从学会了Python,我实现了壁纸自由(6)
  • Ruby 发送邮件 - SMTP
  • Python爱心代码
  • 【二分查找法及其应用】
  • Android 进阶——Framework核心 之Binder Java成员类详解(三)
  • Maven
  • 1947抓住那头牛(队列 广度优先搜索)
  • 基于linux5.15.5的IMX 参考手册 ---21
  • Android Dalvik虚拟机 堆初始化流程
  • 0讲(补)——开发前必备基本常识
  • JS学习笔记
  • linux005之用户、组管理
  • 列线图工具_Nomogram
  • 【C++】类和对象(一)
  • Python获取搜索引擎结果
  • 2.4.8 PCIe——物理逻辑层——REFCLK
  • 树莓派4B arm64 搭建 docker+drone+gitea
  • Java的JDBC编程
  • CSS:块格式化上下文(BFC)
  • paddle表情识别部署
  • Python-第五天 Python函数
  • 【Python学习笔记】28.Python3 错误和异常
  • SQLServer 迁移到 MySQL 工具对比
  • 分析finebi5.x仪表板组件获取数据过程(数据是数据集或者sql的)
  • 设计模式--适配器模式 Adapter Pattern
  • PVE虚拟机篇-rest api
  • 2022-2025学年面向中小学生的白名单全国性竞赛活动清单及官网地址链接
  • Python 高级编程之生成器与协程进阶(五)