项目场景:
归档库中存在大量历史数据,需要将其中敏感数据内容全不清空。
问题描述
由于归档库表很多,且每个表中的数据字段还不一样,希望方便快速的将数据全不修改完。
解决方案:
使用python脚本配置和多线程方式并行处理,参考代码:
import pymysql.cursors
import logging
import time
from concurrent.futures import ThreadPoolExecutor
log_format = "%(asctime)s - %(levelname)s - %(threadName)s - %(message)s"
logging.basicConfig(filename='log.log', level=logging.DEBUG, format=log_format)
# 每次批处理数量
batch_num = 1
# 每表处理总数
count = 2
def update_table_field(table, phoneField, timeField,max_id):
# 连接数据库
connect = pymysql.Connect(
host='lizz.mysql.com',
port=3306,
user='lizz',
passwd='test123',
db='testpy',
charset='utf8'
)
# 获取游标
cursor = connect.cursor()
sum = 0
try:
while sum < count:
# 执行查询sql,.format替换{}
selectSql = "SELECT id FROM {} where id>%s ORDER BY id LIMIT %s;".format(table)
cursor.execute(selectSql, (max_id, batch_num,))
results = cursor.fetchall()
if not results:
break
# 取最大的id
sum = sum + len(results)
max_id = max(results)[0]
# 提取所有 id
ids = [str(row[0]) for row in results]
# 执行update接口,%s字符串占位符
updateSql = "UPDATE {} SET {} = null,{}={} WHERE id IN ({})" \
.format(table, phoneField, timeField, timeField, ','.join('%s' for _ in ids))
# 执行sql语句
cursor.execute(updateSql, ids)
# 提交到数据库执行
cursor.connection.commit()
logging.info(f"{table} {sum},maxid={max_id}")
# 暂停 1秒,防止过快数据库扛不住
time.sleep(1)
logging.info(f"finish {table} {sum},maxid={max_id}")
except Exception as e:
logging.error(f"Exception {table},{sum},{max_id},{str(e)}")
# 关闭数据库连接
connect.close()
def baseUpdatedata():
table_field_pairs = [
('table1', 'name', 'createtime',55),
('table2', 'age', 'updatetime',100)
]
executor = ThreadPoolExecutor(max_workers=2)
for table, phoneField, timeField ,max_id in table_field_pairs:
executor.submit(update_table_field, table, phoneField, timeField,max_id)
# 关闭线程池
executor.shutdown()
logging.info("all finish!")
# 主函数执行
if __name__ == '__main__':
baseUpdatedata()