oracle表增量同步到hive分区表

最新推荐文章于 2024-05-15 17:10:30 发布

逮皮皮虾户

最新推荐文章于 2024-05-15 17:10:30 发布

阅读量895

点赞数 1

CC 4.0 BY-SA版权

本文链接：https://blog.csdn.net/zxt880610/article/details/107507027

本文介绍了一种使用Shell脚本实现从Oracle数据库到Hive数据仓库的数据迁移方法。脚本接收服务器IP、数据库名、表名等参数，通过sqoop工具导入数据，再利用Hive SQL进行数据转换和分区，最后触发Impala刷新，确保数据一致性。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

本文通过一个shell脚本，以传参的形式，将服务器ip、oracle的库名、表名、作为分区字段的字段名称，以及hive的库名、表名作为参数传入，这样可以做到灵活变更，提高通用性与方便性，通过定时器调度此脚本即可。

脚本包含三步：

一：通过sqoop将oracle数据导入到hive临时表，临时表需事先创建，且为非分区表

二：将hive临时表数据insert到hive正式表，以传入的分区字段作为分区，此脚本中分区有年xxxx，月xxxx-xx，日xxxx-xx-xx

三：因为我有用到impala，所以在第三步加上了impala刷新操作，如不刷新，impala将识别不到新增hive数据

#!/bin/bash
#

# import table from oracle into hive

# Number of command-line arguments; echoed here and again inside argParse.
nargs=$#
echo "argument num: $nargs "

# Import window is [one_day, today): yesterday (inclusive) up to today (exclusive).
# one_day is derived from the already captured $today instead of a second
# clock read, so the two values are always exactly one calendar day apart
# even when the script starts right around midnight. Requires GNU date (-d).
today=$(date +%Y-%m-%d)
one_day=$(date -d "${today} -1 day" +%Y-%m-%d)

# Oracle (source) connection settings.
coll_db=''
coll_tab=''
coll_host_ip=''
coll_host_port=1521
coll_tab_username=''
coll_tab_passwd=''

# Hive (target) settings; hive_map_cols, when non-empty, is passed to sqoop --split-by.
hive_db=''
hive_tab=''
#hive_tab_cols=''
hive_map_cols=''

start_dt=''
end_dt=''

# Date column used both for the incremental filter and to derive the partitions.
pt_col=''

# Obtain a Kerberos ticket for the yarn principal. A failure is reported but
# not fatal, because a still-valid ticket cache may already exist.
if ! /usr/bin/kinit -kt /opt/yarn.keytab yarn; then
    echo "$(date +'%F %T') [WARN] kinit with /opt/yarn.keytab failed; continuing with any existing ticket cache." >&2
fi

# Parse key=value command-line words into the script's global settings.
#
# Globals read:    nargs (only echoed)
# Globals written: coll_host_ip, coll_host_port, coll_db, coll_tab,
#                  coll_tab_username, coll_tab_passwd, hive_db, hive_tab,
#                  hive_map_cols, start_dt, end_dt, pt_col
# Arguments:       any number of key=value words. Unknown keys and words
#                  without '=' are ignored.
#
# NOTE(review): start_dt and end_dt are stored and printed, but no import step
# visible in this file reads them; the import window comes from one_day/today.
argParse()
{
    echo "argument num: $nargs "

    local ag arg_key arg_val
    # "$@" is quoted so a value containing whitespace stays one word.
    for ag in "$@"
    do
        # A word without '=' carries no value; without this guard the key
        # and the value would both be the whole word (e.g. hive_db=hive_db).
        [[ "${ag}" == *=* ]] || continue

        # Split on the FIRST '=' only, so values may themselves contain '='.
        arg_key=${ag%%=*}
        arg_val=${ag#*=}

        case "${arg_key}" in
            "coll_host_ip")      coll_host_ip=$arg_val ;;
            "coll_host_port")    coll_host_port=$arg_val ;;
            "coll_db")           coll_db=$arg_val ;;
            "coll_tab")          coll_tab=$arg_val ;;
            "coll_tab_username") coll_tab_username=$arg_val ;;
            "coll_tab_passwd")   coll_tab_passwd=$arg_val ;;
            "hive_db")           hive_db=$arg_val ;;
            "hive_tab")          hive_tab=$arg_val ;;
            #"hive_tab_cols")    hive_tab_cols=$arg_val ;;
            "hive_map_cols")     hive_map_cols=$arg_val ;;
            "start_dt")          start_dt=$arg_val ;;
            "end_dt")            end_dt=$arg_val ;;
            "pt_col")            pt_col=$arg_val ;;
        esac
    done
}

# Parse the key=value pairs given on the command line. "$@" is quoted so each
# command-line argument is handed over as exactly one word.
argParse "$@"
# Print the parsed settings and the computed date window,
# one "name:value" pair per line, to stdout.
printArgs()
{
    printf '%s\n' \
        "coll_host_ip:$coll_host_ip" \
        "coll_db:$coll_db" \
        "coll_tab:$coll_tab" \
        "hive_db:$hive_db" \
        "hive_tab:$hive_tab" \
        "hive_map_cols:$hive_map_cols" \
        "start_dt:$start_dt" \
        "end_dt:$end_dt" \
        "pt_col:$pt_col" \
        "one_day:$one_day" \
        "today:$today"
}

# Print the parsed argument values together with the computed one_day/today dates.
printArgs

# Import the previous day's rows of the Oracle source table into the Hive
# staging table with Sqoop.
#
# Globals read: coll_host_ip, coll_host_port, coll_db, coll_tab,
#               coll_tab_username, coll_tab_passwd, hive_db, hive_tab,
#               hive_map_cols, pt_col, one_day, today
# Source rows:  ${pt_col} in [one_day, today)
# Target:       tmp_${hive_db}.tmp_${hive_tab}, overwritten (--hive-overwrite)
# Exit status:  exits the whole script with 5 when a required setting is empty
#               or when sqoop returns non-zero.
sqoopImpTempTab()
{
    echo "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ [$(date +'%F %T')] sqoop import start @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ "

    # Fail before launching a job: an empty value here would only produce a
    # malformed JDBC URL, table name or WHERE clause.
    local required
    for required in coll_host_ip coll_db coll_tab hive_db hive_tab pt_col
    do
        if [ -z "${!required}" ]; then
            echo "$(date +'%F %T') [ERROR] sqoop import: required argument ${required} is empty."
            exit 5
        fi
    done

    # One argument list for both variants; every value is quoted so that
    # credentials or names containing whitespace/glob characters stay intact.
    # NOTE(review): --password on the command line is visible in the process
    # list; Sqoop's --password-file would avoid that.
    local -a sqoop_args=(
        import
        -Dorg.apache.sqoop.splitter.allow_text_splitter=true
        -Dmapreduce.job.queuename=bf_yarn_pool.production
        --connect "jdbc:oracle:thin:@${coll_host_ip}:${coll_host_port}/${coll_db}"
        --table "${coll_tab}"
        --username "${coll_tab_username}"
        --password "${coll_tab_passwd}"
        --delete-target-dir
        --hive-import
        --hive-overwrite
        --hive-database "tmp_${hive_db}"
        --hive-table "tmp_${hive_tab}"
        --hive-drop-import-delims
        -m 1
        --where "${pt_col} >=to_date('${one_day}','yyyy-mm-dd') and ${pt_col}<to_date('${today}','yyyy-mm-dd')"
        --fields-terminated-by '\001'
    )

    # The split column is optional; it is only passed when one was supplied.
    if [ -n "${hive_map_cols}" ]; then
        sqoop_args+=(--split-by "${hive_map_cols}")
    fi

    sqoop_args+=(--null-string '\\N' --null-non-string '\\N')

    local rc
    sqoop "${sqoop_args[@]}"
    rc=$?

    if [ "${rc}" -ne 0 ]; then
        echo "$(date +'%F %T') [ERROR] sqoop import database:tmp_$hive_db table:tmp_$hive_tab error."
        exit 5
    fi

    echo "$(date +'%F %T') [INFO] sqoop import database:tmp_$hive_db table:tmp_$hive_tab successfully."
    echo "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ [$(date +'%F %T')] sqoop import end @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ "
}
# Step 1: import the Oracle rows into the Hive tmp_ staging table.
sqoopImpTempTab

# Copy the staged rows from tmp_${hive_db}.tmp_${hive_tab} into the partitioned
# table ${hive_db}.${hive_tab} through beeline, using dynamic partitions.
#
# Globals read: hive_db, hive_tab, pt_col, one_day, today
# Partitions:   pk_year, pk_month, pk_day are taken from the first 4, 7 and 10
#               characters of ${pt_col}.
# Exit status:  exits the whole script with 5 when beeline returns non-zero.
#
# NOTE(review): the function name suggests an RCFile target; the storage format
# is decided by the target table's DDL, which is not part of this file.
hiveStoreAsRc()
{
    echo '---------------------------------------------------------- hive store as rcfile start -------------------------------------------------------------------------------------'

    # Session settings enable non-strict dynamic partitioning and raise the
    # partition/file limits before the INSERT OVERWRITE runs.
    local hsql="use ${hive_db};set hive.exec.dynamic.partition=true;set hive.exec.dynamic.partition.mode=nonstrict;
set mapreduce.map.memory.mb=15000;set mapreduce.reduce.memory.mb=15000; set hive.merge.mapredfiles=true;set hive.exec.max.created.files=100000;
SET hive.exec.max.dynamic.partitions=100000;SET hive.exec.max.dynamic.partitions.pernode=100000;from tmp_${hive_db}.tmp_${hive_tab} \
       INSERT OVERWRITE TABLE ${hive_db}.${hive_tab} PARTITION(pk_year,pk_month,pk_day) select *,substr(${pt_col}, 1, 4),substr(${pt_col}, 1, 7),substr(${pt_col}, 1, 10) \
       where ${pt_col} >='${one_day}' and ${pt_col}<'${today}' "

    echo "#################### hsql: $hsql"

    # The password is a masked placeholder in this file. It is quoted so the
    # shell passes it literally; unquoted, ******* is a glob that expands to
    # the file names in the current directory.
    # NOTE(review): supply the real credential from a protected source.
    local rc
    beeline --hiveconf mapreduce.job.queuename=bf_yarn_pool.production --silent=true --showHeader=false --showWarnings=false -u 'jdbc:hive2://localhost:10000/default;' -n yarn -p '*******' -e "${hsql}"
    rc=$?

    if [ "${rc}" -ne 0 ]; then
        echo "$(date +'%F %T') [ERROR] ${hive_db}.${hive_tab} store failure."
        exit 5
    fi

    echo "$(date +'%F %T') [INFO] ${hive_db}.${hive_tab} store successfully."
    echo '---------------------------------------------------------- hive store as rcfile end -------------------------------------------------------------------------------------'
}

# Step 2: insert the staged rows into the partitioned Hive table.
hiveStoreAsRc

# Issue "refresh ${hive_db}.${hive_tab}" through beeline against port 21050 so
# that Impala picks up the newly written Hive data, then end the script.
#
# Globals read: hive_db, hive_tab
# Exit status:  always exits the script: 0 when beeline succeeds, 5 otherwise.
impTabRefrsh()
{
    echo '********************************************************** impala refresh table start *****************************************************************************************'

    # The masked placeholder password is quoted; unquoted, ******* is a glob
    # that expands to the file names in the current directory.
    # NOTE(review): supply the real credential from a protected source.
    local rc
    beeline --silent=true --showHeader=false --showWarnings=false -u 'jdbc:hive2://localhost:21050/default;' -n yarn -p '*******' -e "refresh ${hive_db}.${hive_tab}"
    rc=$?

    if [ "${rc}" -ne 0 ]; then
        echo "$(date +'%F %T') [ERROR] impala:refresh ${hive_db}.${hive_tab} failure!"
        exit 5
    fi

    echo "$(date +'%F %T') [INFO] impala:refresh ${hive_db}.${hive_tab} success!"
    # The end banner is printed before exiting; placed after the exit calls it
    # could never be reached.
    echo '********************************************************** impala refresh table end *****************************************************************************************'
    exit 0
}

# Step 3: refresh the table in Impala; this call also ends the script (exit 0 or 5).
impTabRefrsh