最近老大要求写一个定时执行的调度程序,保证写好的 json 文件在带变量的情况下,能把每小时的任务按照增量抽取的方式保存到 HDFS,并且需要把 json 运行的日志以天为单位保存在本地目录下,于是赶忙找来小老弟把这个程序写了一下。
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import threading
import time
import os
import commands
import json
import codecs
import thread
# 获取当前时间
now = time.strftime('%Y.%m.%d', time.localtime(time.time()))
# shell脚本需要的时间
dataTime = time.strftime('%Y%m%d%H%M', time.localtime(time.time()))
# 目录名
dataDir = time.strftime('%Y%m%d', time.localtime(time.time()))
exitFlag = 0
# 打开文件
fo = open("/home/data/datax/projobs/time.txt", 'r')
# fo = open("time.txt", 'r')
# 获取上一次保存的时间
last_time = fo.read()
fo.close()
# print last_time
class myThread(threading.Thread):
    """Worker thread that runs a single datax json job.

    All job parameters are captured at construction time; the actual
    work is deferred to run(), which hands them to do_datax_json().
    """

    def __init__(self, threadID, name, counter, last_time, now, filetype, fileName):
        """Remember the job parameters for the later run() call."""
        super(myThread, self).__init__()
        self.threadID = threadID
        self.name = name
        self.counter = counter
        self.fileName = fileName
        self.filetype = filetype
        self.last_time = last_time
        self.now = now

    def run(self):
        """Called by start(): execute the stored datax job."""
        do_datax_json(self.fileName, self.last_time, self.now, self.filetype)
def do_datax_json(fileName, last_time, now, filetype):
    """Run one datax json job as a shell command and append its output
    to a per-job log file in the global ``mkpath`` directory.

    fileName  -- json job file name, e.g. 'dim_wx_channel.json'
    last_time -- watermark of the previous run ('' means never ran)
    now       -- current run timestamp, passed to the job as -DnowTime
    filetype  -- extension stripped when building the log name ('.json')

    Relies on the module globals ``dataTime`` (timestamp embedded in the
    HDFS file name) and ``mkpath`` (per-day log directory).
    """
    # Empty watermark: the job has never run, extract everything since
    # the epoch.
    if not last_time:
        last_time = '1970.01.01 00:00:00'
    f_name = fileName[:-5]            # drop the '.json' suffix
    # Dimension tables carry a 'dim_' prefix that is not part of the
    # HDFS file name.
    if f_name.startswith('dim_'):
        f_name = f_name[4:]
    print(f_name)
    # A few jobs need special -D parameters; everything else follows the
    # common "<table>__<timestamp>" naming convention.
    if (f_name == 't_school_data') or (f_name == 't_school_data_edb_rel'):
        string = "python /home/data/datax/bin/datax.py /home/data/datax/projobs/" + fileName + " -p '-Dsource=jzproduct'"
    elif f_name == 'wx_channel':
        string = "python /home/data/datax/bin/datax.py /home/data/datax/projobs/" + fileName + " -p '-DfileName=\"weixin-accoun__" + dataTime + "\" -DlastTime=\"" + last_time + "\" -DnowTime=\"" + now + "\"'"
    elif f_name == 'fact_user_role':
        string = "python /home/data/datax/bin/datax.py /home/data/datax/projobs/" + fileName + " -p '-DfileName=\"t_fact_user_role__" + dataTime + "\" -DlastTime=\"" + last_time + "\" -DnowTime=\"" + now + "\"'"
    else:
        string = "python /home/data/datax/bin/datax.py /home/data/datax/projobs/" + fileName + " -p '-DfileName=\"" + f_name + "__" + dataTime + "\" -DlastTime=\"" + last_time + "\" -DnowTime=\"" + now + "\"'"
    print(string)
    retcode, ret = commands.getstatusoutput(string)
    # Append timestamp, command and the full datax output to the day's
    # log for this job.
    final_name = fileName.replace(filetype, '')
    # BUG FIX: the original appended '' instead of '\n' after each
    # entry, gluing all log lines together; it also leaked the handle
    # if a write failed.
    with open(mkpath + final_name + ".txt", 'a+') as foress:
        foress.write("脚本执行时间:" + now + "\n")
        foress.write("脚本执行命令:" + string + "\n")
        foress.write(ret + "\n")
def mkdir(path):
    """Create directory *path* (and any missing parents) if absent.

    Leading/trailing whitespace and trailing backslashes are stripped
    from *path* first.  Returns True when the directory was created,
    False when it already existed.

    NOTE(review): check-then-create is race-prone if two schedulers run
    concurrently; kept as-is to preserve the printed output order.
    """
    # `os` is already imported at module level; the original re-imported
    # it locally for no reason.
    path = path.strip()
    path = path.rstrip("\\")
    exists = os.path.exists(path)
    print(exists)
    if not exists:
        os.makedirs(path)
        print(path + ' 创建成功')
        return True
    else:
        print(path + ' 目录已存在')
        return False
# Per-day log directory, e.g. /home/data/datax/projobs/log/20190415/
# (do_datax_json() reads this global when writing its log file).
mkpath = "/home/data/datax/projobs/log/" + dataDir + "/"
mkdir(mkpath)

max_number = 0                      # number of json jobs found
threads = []
path = '/home/data/datax/projobs'   # directory holding the datax json jobs
filetype = '.json'                  # only schedule files of this type
for root, dirs, files in os.walk(path):
    for i in files:
        if filetype in i:
            max_number += 1
            # BUG FIX: the old code only started a thread while
            # max_number <= set_max (hard-coded 7), so any json file
            # beyond the seventh was silently skipped and its data
            # never reached HDFS.  Every matching file now gets its
            # own worker thread.
            t = myThread(max_number, 'thread-' + str(max_number), 1,
                         last_time, now, filetype, i)
            t.start()
            threads.append(t)
# Wait for every job to finish before persisting the new watermark,
# otherwise a crash mid-run would skip the unfinished increments.
for t in threads:
    t.join()
# Save the current timestamp as the watermark for the next run.
foo = open("/home/data/datax/projobs/time.txt", 'w')
try:
    foo.write(now)
finally:
    foo.close()
print("Exiting Main Thread")
没想到小老弟还给我埋了个坑:程序里把可调度的 json 文件数写死成了最多 7 个。我前一天写了十个 json,第二天发现传输流程报错,错误日志显示 HDFS 里没有对应文件,仔细一检查才知道是这个最大值限制导致多出的任务被直接跳过,害我排查了半天。