当前位置: 首页 > news >正文

拉链表-spark版本

采用spark实现的拉链表

拉链表初始化

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.lit/*** 拉链表初始化*/
object table_zip_initial {val lastDay = "9999-12-31"def main(args: Array[String]): Unit = {var table_base = "t_uac_organization" //基表var table_zip = "ods_uac_org_zip" //拉链表/*** 基于该天的t_uac_organization*/var dt = "2023-01-31"System.setProperty("HADOOP_USER_NAME", "root")val builder = SparkUtils.getBuilderif (System.getProperties.getProperty("os.name").contains("Windows")) {builder.master("local[*]")} else {table_base = args(0)table_zip = args(1)dt = args(2)}val spark = builder.appName(this.getClass.getName).getOrCreate()val hive_db = "common"spark.sql(s"use $hive_db")/*** 初始化,一次*/if (!TableUtils.tableExists(spark, hive_db, table_zip)) {println(s"$table_zip not exists,初始化")init(dt, spark, hive_db, table_base, table_zip)} else {val t_zip = spark.sql(s"""||select * from $table_zip where dt='$lastDay'||""".stripMargin)if (t_zip.isEmpty) {//initprintln(s"$table_zip isEmpty 初始化")init(dt, spark, hive_db, table_base, table_zip)} else {println(s"$table_zip exist and not empty,无需初始化!!!")}}spark.stop()}private def init(dt: String, spark: SparkSession, hive_db: String, table_base: String, table_zip: String): Unit = {val t_base = spark.sql(s"""||select * from $table_base where dt='${dt}'|""".stripMargin)println(s"$table_base show")t_base.show(false)val ods_zip = t_base.drop("dt").withColumn("t_start", lit(dt)).withColumn("t_end", lit(lastDay)).withColumn("dt", lit(lastDay))if (!ods_zip.isEmpty) {println(s"$table_zip show")ods_zip.show(false)println(s"$table_zip 初始化...")SinkUtil.sink_to_hive(lastDay, spark, ods_zip, hive_db, hive_table = s"$table_zip", "parquet", MySaveMode.OverWriteByDt)} else {println(s"$table_zip is empty,初始化失败...")}}
}

拉链表每日滚动计算

import org.apache.spark.sql.functions.{count, lit}
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
import org.apache.spark.storage.StorageLevel/*** 拉链表只能从装载首日起,一天一天滚动计算*/
object ods_uac_org_zip {val lastDay = "9999-12-31"def main(args: Array[String]): Unit = {var dt = "2023-02-01"var dt1 = "2023-02-02"System.setProperty("HADOOP_USER_NAME", "root")val builder = SparkUtils.getBuilderif (System.getProperties.getProperty("os.name").contains("Windows")) {builder.master("local[*]")} else {dt = args(0)dt1 = args(1)}val spark = builder.appName(this.getClass.getName).getOrCreate()val hive_db = "common"spark.sql(s"use $hive_db")new IDate {override def onDate(dt: String): Unit = {processByDt(spark, dt, hive_db)}}.invoke(dt, dt1)spark.stop()}/*** 滚动计算每个dt的对应的过期数据*/def processByDt(spark: SparkSession, dt: String, hive_db: String): Unit = {val theDayBeforeDt = DateUtil.back1Day(dt + " 00:00:00").split(" ")(0)/*** 一定需要先缓存* 否则重算则fileNotFoundException* 因此需要借助临时表处理或者设置ck*/var ods_uac_org_zip = spark.sql(s"""||select * from ods_uac_org_zip where dt='$lastDay'|""".stripMargin).persist(StorageLevel.MEMORY_ONLY_SER_2)/*** 持久化为临时表*/ods_uac_org_zip.repartition(3).write.format("parquet").mode(SaveMode.Overwrite).saveAsTable(s"${hive_db}.ods_uac_org_zip_tmp")/*** 已经指向临时表* 后续方便对源表(ods_uac_org_zip)进行更新*/ods_uac_org_zip = spark.sql(s"""||select * from ods_uac_org_zip_tmp|""".stripMargin)/*** old,已经存在的拉链表的最新全量*/val f_old_9999 = ods_uac_org_zip.drop("dt")println("f_old_9999 show")f_old_9999.show(false)/*** dt该天的新增和变化*/val f_new = spark.sql(s"""||select * from new_change_t_uac_organization where dt='${dt}'|""".stripMargin).drop("dt").withColumnRenamed("id", "id2").withColumnRenamed("org_name", "org_name2").withColumnRenamed("parent_id", "parent_id2").withColumnRenamed("sort", "sort2").withColumnRenamed("org_type", "org_type2").withColumnRenamed("org_level", "org_level2").withColumnRenamed("is_auth_scope", "is_auth_scope2").withColumnRenamed("parent_auth_scope_id", "parent_auth_scope_id2").withColumnRenamed("status", "status2").withColumnRenamed("icon_class", "icon_class2").withColumnRenamed("create_id", "create_id2").withColumnRenamed("create_time", "create_time2").withColumnRenamed("update_id", "update_id2").withColumnRenamed("update_time", "update_time2").withColumnRenamed("version", "version2").withColumn("t_start2", lit(dt)).withColumn("t_end2", lit(lastDay))println("f_new show")f_new.show(false)val f1 = f_old_9999.join(f_new, f_old_9999.col("id") === f_new.col("id2"), "full_outer")f1.createOrReplaceTempView("v1")println("v1 temp show")f1.show(false)f1.filter(s"id='1008'").show(false)/*** 这是所有dt=9999的*/val f_9999: DataFrame = spark.sql("""||select|nvl(id2,id) as id|,nvl(org_name2,org_name) as org_name|,nvl(parent_id2,parent_id) as parent_id|,nvl(sort2,sort) as sort|,nvl(org_type2,org_type) as org_type|,nvl(org_level2,org_level) as org_level|,nvl(is_auth_scope2,is_auth_scope) as is_auth_scope|,nvl(parent_auth_scope_id2,parent_auth_scope_id) as parent_auth_scope_id|,nvl(status2,status) as status|,nvl(icon_class2,icon_class) as icon_class|,nvl(create_id2,create_id) as create_id|,nvl(create_time2,create_time) as create_time|,nvl(update_id2,update_id) as update_id|,nvl(update_time2,update_time) as update_time|,nvl(version2,version) as version|,nvl(t_start2,t_start) as t_start|,nvl(t_end2,t_end) as t_end|,nvl(t_end2,t_end) as dt||from v1|||""".stripMargin)/*** +----+--------------------------------------------+---------+----+--------+---------+-------------+--------------------+------+-------------------------+---------+-------------------+---------+-------------------+-------+----------+----------+----------+* |id  |org_name                                    |parent_id|sort|org_type|org_level|is_auth_scope|parent_auth_scope_id|status|icon_class               |create_id|create_time        |update_id|update_time        |version|t_start   |t_end     |dt        |* +----+--------------------------------------------+---------+----+--------+---------+-------------+--------------------+------+-------------------------+---------+-------------------+---------+-------------------+-------+----------+----------+----------+* |1   |运营系统                                    |0        |0   |4       |1        |N            |null                |1     |iconfont icon-xitong     |655      |2019-05-20 17:58:11|null     |null               |null   |2023-01-31|9999-12-31|9999-12-31|*/println("f_9999 show")f_9999.show(false)println(s"在${dt}的发生状态变化的,新的有效区间[$dt,$lastDay]...")f_9999.filter(s"t_start='$dt'").show()f_9999.groupBy("dt").agg(count("id")).show()/*** 过期的数据* 需要闭合t_end* dt天发现有变化,那么则在dt-1天过期* 过期的数据:上一次的起始时间,必然小于dt(这个条件很重要,否则幂等计算会有问题,会把计算过的历史分区的起始时间给覆盖掉)*/val f_expire = spark.sql(s"""||select|id,|org_name,|parent_id,|sort,|org_type,|org_level,|is_auth_scope,|parent_auth_scope_id,|status,|icon_class,|create_id,|create_time,|update_id,|update_time,|version,|t_start,|cast(date_add('${dt}',-1) as string) as t_end,|cast(date_add('${dt}',-1) as string) as dt||from v1|where id2 is not null and id is not null and t_start<'$dt'||""".stripMargin)println("f_expire show")f_expire.show(false)/*** 没有动态分区,那就分别各自持久化*/if (!f_9999.isEmpty) {SinkUtil.sink_to_hive(lastDay, spark, f_9999, hive_db, hive_table = "ods_uac_org_zip", "parquet", MySaveMode.OverWriteByDt)}if (!f_expire.isEmpty) {SinkUtil.sink_to_hive(theDayBeforeDt, spark, f_expire, hive_db, hive_table = "ods_uac_org_zip", "parquet", MySaveMode.OverWriteByDt)}}
}
http://www.lryc.cn/news/237612.html

相关文章:

  • 【笔记1-2】Qt系列:QkeyEvent 键盘事件 设定快捷键
  • adb突然获取不到华为/荣耀手机。。。
  • layui的layer.confirm获取按钮焦点
  • 【HarmonyOS】鸿蒙应用开发基础认证题目
  • Mocha
  • Java详解I/O
  • 数据处理生产环境_spark获取df列当前日期的前一天日期
  • 第四代智能井盖传感器,实时守护井盖位安全
  • 【前端知识】Node——文件流的读写操作
  • 解决证书加密问题:OpenSSL与urllib3的兼容性与优化
  • #gStore-weekly | gAnswer源码解析 调用NE模块流程
  • vscode 配置 lua
  • vscode设置代码模板
  • 用css实现原生form中radio单选框和input的hover已经focus的样式
  • uniapp:录音权限检查,录音功能
  • Rust开发——切片(slice)类型
  • 如何给shopify motion主题的产品系列添加description
  • 力扣刷题-二叉树-二叉树最小深度
  • 注解方式优雅的实现 Redisson 分布式锁
  • PHP/Laravel通过经纬度计算距离获取附近商家
  • grafana面板介绍
  • 实验三 循环结构程序设计(Python)
  • Flutter笔记:目录与文件存储以及在Flutter中的使用(上)
  • 注意了!申请流量卡时地址一定不要填写学校,不好下卡哦!
  • minio使用shell上传文件
  • LeetCode538. Convert BST to Greater Tree
  • iPaaS和RPA,企业自动化应该如何选择?
  • AI实践与学习1_Milvus向量数据库实践与原理分析
  • 3Dexcite deltgen 2022x 新功能
  • 代码随想录算法训练营第六十天 | LeetCode 84. 柱状图中最大的矩形