目录
1.编程题
1.1Santa的接待安排
圣诞节前100天,Santa开放了workshop,欢迎以家庭单位的参观,如何更合理的安排这些家庭参观?
每个家庭有10个选择choice0-9,数字代表了距离圣诞节的天数,比如 1代表12月24日,每个家庭必须并且只安排一次参观。家庭数量 5000,即family_id 为[0, 4999],每天访问的人数需要在125-300人。
任务:Santa的接待安排
为了更合理的计算Santa的安排能力,我们使用preference cost和accounting penalty两个指标
- 1)preference cost,代表Santa的个性化安排能力
- 2)accounting penalty,代表Santa安排的财务成本
最终的 Score = preference cost + accounting penalty
目标:
- 使用LP线性规划工具完成代码
- 使用整数规划完成代码
- 计算的总score在8万以内
1.1.1读取数据
import pandas as pd
df = pd.read_csv('./family_data.csv', index_col = 'family_id')
df.head()
1.1.2创建pcost_mat
import numpy as np
# n代表家庭成员个数,如果满足第choice个需求,需要的penalty(惩罚)
def get_penalty(n, choice):
penalty = None
if choice == 0:
penalty = 0
if choice == 1:
penalty = 50
if choice == 2:
penalty = 50 + 9 * n
if choice == 3:
penalty = 100 + 9 * n
if choice == 4:
penalty = 200 + 9 * n
if choice == 5:
penalty = 200 + 18 * n
if choice == 6:
penalty = 300 + 18 * n
if choice == 7:
penalty = 300 + 36 * n
if choice == 8:
penalty = 400 + 36 * n
if choice == 9:
penalty = 500 + (36 + 199) * n
if choice > 9:
penalty = 500 + (36 + 398) * n
return penalty
N_DAYS = 100 # 安排的天数
N_FAMILY = 5000 #家庭ID的个数
MIN_OCCUPANCY = 125 # 最小承载量
MAX_OCCUPANCY = 300 # 最大承载量
# 计算pcost_mat,每个家庭在什么时候(day0-99)访问时的penalty
# 大小5000*100的矩阵
pcost_mat = np.full(shape=(N_FAMILY,100),fill_value=999999)
for f in range(N_FAMILY):
# 家庭成员数
f_num = df.loc[f,'n_people']
# 对于第f个家庭,初始化pcost = other choice下的penalty
pcost_mat[f,:] = get_penalty(f_num,10) #初始值最大惩罚
# 计算choice 0-9 的penalty
for choice in range(10):# choice 0-9
temp = df.loc[f][choice] #choice的天数
penalty = get_penalty(f_num,choice) # 得到对应choice的惩罚
pcost_mat[f,temp-1] = penalty # 因为下标是从0开始,所以要在天数基础上-1才是下标值
print(pcost_mat)
print(pcost_mat.shape) # (5000, 100)
1.1.3计算accounting penalty矩阵
# 计算accounting penalty矩阵,前一天的参观人数和当天的参观人数
acost_mat = np.zeros(shape=(MAX_OCCUPANCY+1,MAX_OCCUPANCY+1),dtype=np.float64)
for i in range(acost_mat.shape[0]):# 当天安排的人数
for j in range(acost_mat.shape[1]):# 前一天安排的人数
diff = abs(i-j)
acost_mat[i,j] = max(0,(i - 125) / 400 * i ** (0.5 + diff/50.0))
print(acost_mat)
# 计算每个家庭成员的数量
FAMILY_SIZE = df['n_people'].values
# DESIRED代表每个家庭的 choices-1 => 转化为坐标
DESIRED = df.values[:, :-1]-1
print(FAMILY_SIZE) # array([4, 4, 3, ..., 6, 5, 4])
print(len(DESIRED));print(DESIRED)
"""
5000
array([[51, 37, 11, ..., 75, 9, 27],
[25, 3, 81, ..., 5, 65, 60],
[99, 53, 24, ..., 88, 79, 32],
...,
[31, 65, 53, ..., 80, 2, 6],
[66, 91, 3, ..., 11, 25, 69],
[12, 10, 24, ..., 38, 17, 46]])
"""
1.1.4线性规划求解
from ortools.linear_solver import pywraplp
# 线性规划
def solveLP():
solver= pywraplp.Solver('AssignmentProblem', pywraplp.Solver.GLOP_LINEAR_PROGRAMMING)
x = {} #family_id在第j天是否参观
candidates = [[] for x in range(N_DAYS)] # 定义了len为100的list
for i in range(N_FAMILY):#family_id
for j in DESIRED[i,:]:# family_id的choice
candidates[j].append(i) # 在第j天,有第i个family参观
x[i,j] = solver.BoolVar('x[%i,%i]' %(i,j)) # x[%i,%i]中的i代表integer类型
daily_occupancy = [solver.Sum([x[i,j] * FAMILY_SIZE[i] for i in candidates[j]])\
for j in range(N_DAYS)] # j代表1-100天
family_presence = [solver.Sum(x[i,j] for j in DESIRED[i,:])\
for i in range(N_FAMILY)]
preference_cost = solver.Sum([pcost_mat[i,j] * x[i,j] for i in range(N_FAMILY)\
for j in DESIRED[i,:]])
# 满足preference_cost最小
solver.Minimize(preference_cost)
# 人为增加约束条件为了更加拟合 Constraints
for j in range(N_DAYS-1):
# 当前人数不超过前一天人数+25
solver.Add(daily_occupancy[j] - daily_occupancy[j+1]<=25)
solver.Add(daily_occupancy[j+1] - daily_occupancy[j]<=25)
# 每个家庭都在10个choice中出现一次
for i in range(N_FAMILY):
solver.Add(family_presence[i]==1)
# 每天访问人数约束
for j in range(N_DAYS):
solver.Add(daily_occupancy[j]>=MIN_OCCUPANCY)
solver.Add(daily_occupancy[j]<=MAX_OCCUPANCY)
result = solver.Solve()
temp = [(i,j,x[i,j].solution_value()) for i in range(N_FAMILY)\
for j in DESIRED[i,:] if x[i,j].solution_value()>0]
# 得到参观日期安排
df = pd.DataFrame(temp,columns=['family_id','day','result'])
return df
result = solveLP()
print(result)
1.1.4.1设置阈值
# 设置阈值
THRS = 0.999
assigned_df = result[result.result > THRS]
print(assigned_df)
1.1.4.2找出没有安排的家庭
# 没有安排好的,即不为0和1
unassigned_df = result[(result.result < THRS) & (result.result > 1-THRS)]
print(unassigned_df)
1.1.4.3计算每天访问的人数
# 计算每天访问的人数(根据assigned_df)
assigned_df['family_size'] = FAMILY_SIZE[assigned_df.family_id]
print(assigned_df)
1.1.4.4按照day进行聚合
# 按照day进行聚合
occupancy = assigned_df.groupby('day').family_size.sum().values
print(occupancy)
1.1.4.5创建min_occupancy和max_occupancy
min_occupancy = np.array([max(0,MIN_OCCUPANCY - x) for x in occupancy])
print(min_occupancy)
max_occupancy = np.array([MAX_OCCUPANCY - x for x in occupancy])
print(max_occupancy)
1.1.5整数规划求解
#进行 使用整数规划求解
def solveIP(families,min_occupancy,max_occupancy):
# 创建求解器
solver = pywraplp.Solver('AssignmentProblem', pywraplp.Solver.CBC_MIXED_INTEGER_PROGRAMMING)
# 需要安排的家庭
n_families = len(families)
x = {} # family_id在第j天是否参观
candidates = [[] for x in range(N_DAYS)] # 定义了len为100的list
for i in families:#family_id
for j in DESIRED[i,:]:# family_id的choice
#print(j)
candidates[j].append(i) # 在第j天,有第i个family参观
x[i,j] = solver.BoolVar('x[%i,%i]' %(i,j)) # x[%i,%i]中的i代表integer类型
daily_occupancy = [solver.Sum([x[i,j] * FAMILY_SIZE[i] for i in candidates[j]])\
for j in range(N_DAYS)] # j代表1-100天
family_presence = [solver.Sum(x[i,j] for j in DESIRED[i,:])\
for i in families]
preference_cost = solver.Sum([pcost_mat[i,j] * x[i,j] for i in families\
for j in DESIRED[i,:]])
# 满足preference_cost最小
solver.Minimize(preference_cost)
# 每个家庭都在10个choice中出现一次
for i in range(n_families):
solver.Add(family_presence[i]==1)
# 每天访问人数约束
for j in range(N_DAYS):
solver.Add(daily_occupancy[j]>=min_occupancy[j])
solver.Add(daily_occupancy[j]<=max_occupancy[j])
result = solver.Solve()
temp = [(i,j) for i in families\
for j in DESIRED[i,:] if x[i,j].solution_value()>0]
# 计算剩余家庭的安排
df = pd.DataFrame(temp,columns=['family_id','day'])
return df
# 没有安排的family_id
unassigned = unassigned_df.family_id.unique()
result = solveIP(unassigned,min_occupancy,max_occupancy)
print(unassigned)
"""
array([ 59, 240, 262, 476, 488, 610, 630, 724, 926, 941, 961,
1012, 1106, 1167, 1176, 1218, 1235, 1271, 1387, 1444, 1484, 1536,
1610, 1611, 1653, 1720, 1791, 1977, 2004, 2101, 2135, 2195, 2229,
2318, 2344, 2395, 2641, 2778, 2862, 2874, 3033, 3104, 3118, 3228,
3237, 3278, 3280, 3348, 3470, 3504, 3642, 3649, 3786, 4045, 4077,
4084, 4404, 4421, 4431, 4471, 4518, 4619, 4661, 4761, 4820, 4850,
4886, 4912, 4914, 4961])
"""
1.1.5.1合并数据
# 合并数据
df_concat = pd.concat((assigned_df[['family_id','day']],result)).sort_values('family_id')
print(df_concat)
1.1.5.2使用numba加速
from numba import njit
@njit(fastmath=True)
def pcost(prediction):
daily_occupancy = np.zeros(N_DAYS+1,dtype=np.int64)
penalty = 0
for (i,p) in enumerate(prediction):
n = FAMILY_SIZE[i]
penalty += pcost_mat[i,p]
daily_occupancy[p] += n
return penalty,daily_occupancy
@njit(fastmath=True)
def acost(daily_occupancy):
accounting_cost = 0
num_out_of_range = 0
for day in range(N_DAYS):
n_p1 = daily_occupancy[day+1] # 前一天
n = daily_occupancy[day] # 当天
num_out_of_range += (n>MAX_OCCUPANCY) or (n<MIN_OCCUPANCY)
accounting_cost += acost_mat[n,n_p1]
return accounting_cost,num_out_of_range
@njit(fastmath=True)
def cost_function(prediction):
penalty,daily_occupancy = pcost(prediction) #统计preference cost和每天承载数量
accounting_cost,num_out_of_range = acost(daily_occupancy) # 根据每天承载数量计算accounting cost
final_score = penalty + accounting_cost + num_out_of_range * 9999999
return final_score
prediction = df_concat.day.values
print(cost_function(prediction)) # 109138.55183535561
1.1.6保存submission.csv
def save_result(pred,filename):
result = pd.DataFrame(range(N_FAMILY),columns=['family_id'])
result['assigned_day'] = pred + 1
result.to_csv(filename,index=False)
print(filename + ' saved')
return result
result = save_result(prediction,'submission.csv')
1.1.7寻找更好的替代方案
# 寻找更好的替代方案
def find_better(pred):
fobs = np.argsort(FAMILY_SIZE) # 返回数组从小到大的索引
#print(np.sort(FAMILY_SIZE)) # 对FAMILY_SIZE按从小到大的顺序排序
score = cost_function(pred)
original_score = np.inf #打擂法 正无穷初始值
# 如果找不到更新则退出
while score < original_score:
original_score = score
for family_id in fobs:
for pick in range(10):
# 得到family_id在choice pick的日期day
day = DESIRED[family_id, pick]
# 该family的原有日期oldvalue
oldvalue = pred[family_id]
# 将原有oldvalue替换为现在的choice pick
pred[family_id] = day
# 重新计算分数
new_score = cost_function(pred)
# 如果比原来分数小,更新choice成功
if new_score < score:
score = new_score
else:# 设置为原来的oldvalue
pred[family_id] = oldvalue
print(score,end='\r')
print(score)
print(find_better(prediction)) # -3.041118109153846e+216
import numpy as np
import pandas as pd
from numba import njit
from ortools.linear_solver import pywraplp
df = pd.read_csv('./family_data.csv', index_col = 'family_id')
# n代表家庭成员个数,如果满足第choice个需求,需要的penalty(惩罚)
def get_penalty(n, choice):
penalty = None
if choice == 0:
penalty = 0
if choice == 1:
penalty = 50
if choice == 2:
penalty = 50 + 9 * n
if choice == 3:
penalty = 100 + 9 * n
if choice == 4:
penalty = 200 + 9 * n
if choice == 5:
penalty = 200 + 18 * n
if choice == 6:
penalty = 300 + 18 * n
if choice == 7:
penalty = 300 + 36 * n
if choice == 8:
penalty = 400 + 36 * n
if choice == 9:
penalty = 500 + (36 + 199) * n
if choice > 9:
penalty = 500 + (36 + 398) * n
return penalty
N_DAYS = 100 # 安排的天数
N_FAMILY = 5000 #家庭ID的个数
MIN_OCCUPANCY = 125 # 最小承载量
MAX_OCCUPANCY = 300 # 最大承载量
# 计算pcost_mat,每个家庭在什么时候(day0-99)访问时的penalty
# 大小5000*100的矩阵
pcost_mat = np.full(shape=(N_FAMILY,100),fill_value=999999)
for f in range(N_FAMILY):
# 家庭成员数
f_num = df.loc[f,'n_people']
# 对于第f个家庭,初始化pcost = other choice下的penalty
pcost_mat[f,:] = get_penalty(f_num,10) #初始值最大惩罚
# 计算choice 0-9 的penalty
for choice in range(10):# choice 0-9
temp = df.loc[f][choice] #choice的天数
penalty = get_penalty(f_num,choice) # 得到对应choice的惩罚
pcost_mat[f,temp-1] = penalty # 因为下标是从0开始,所以要在天数基础上-1才是下标值
# 计算accounting penalty矩阵,前一天的参观人数和当天的参观人数
acost_mat = np.zeros(shape=(MAX_OCCUPANCY+1,MAX_OCCUPANCY+1),dtype=np.float64)
for i in range(acost_mat.shape[0]):# 当天安排的人数
for j in range(acost_mat.shape[1]):# 前一天安排的人数
diff = abs(i-j)
acost_mat[i,j] = max(0,(i - 125) / 400 * i ** (0.5 + diff/50.0))
# 计算每个家庭成员的数量
FAMILY_SIZE = df['n_people'].values
# DESIRED代表每个家庭的 choices-1 => 转化为坐标
DESIRED = df.values[:, :-1]-1
print(len(DESIRED))
# 线性规划
def solveLP():
solver= pywraplp.Solver('AssignmentProblem', pywraplp.Solver.GLOP_LINEAR_PROGRAMMING)
x = {} #family_id在第j天是否参观
candidates = [[] for x in range(N_DAYS)] # 定义了len为100的list
for i in range(N_FAMILY):#family_id
for j in DESIRED[i,:]:# family_id的choice
candidates[j].append(i) # 在第j天,有第i个family参观
x[i,j] = solver.BoolVar('x[%i,%i]' %(i,j)) # x[%i,%i]中的i代表integer类型
daily_occupancy = [solver.Sum([x[i,j] * FAMILY_SIZE[i] for i in candidates[j]])\
for j in range(N_DAYS)] # j代表1-100天
family_presence = [solver.Sum(x[i,j] for j in DESIRED[i,:])\
for i in range(N_FAMILY)]
preference_cost = solver.Sum([pcost_mat[i,j] * x[i,j] for i in range(N_FAMILY)\
for j in DESIRED[i,:]])
# 满足preference_cost最小
solver.Minimize(preference_cost)
# 人为增加约束条件为了更加拟合 Constraints
for j in range(N_DAYS-1):
# 当前人数不超过前一天人数+25
solver.Add(daily_occupancy[j] - daily_occupancy[j+1]<=25)
solver.Add(daily_occupancy[j+1] - daily_occupancy[j]<=25)
# 每个家庭都在10个choice中出现一次
for i in range(N_FAMILY):
solver.Add(family_presence[i]==1)
# 每天访问人数约束
for j in range(N_DAYS):
solver.Add(daily_occupancy[j]>=MIN_OCCUPANCY)
solver.Add(daily_occupancy[j]<=MAX_OCCUPANCY)
result = solver.Solve()
temp = [(i,j,x[i,j].solution_value()) for i in range(N_FAMILY)\
for j in DESIRED[i,:] if x[i,j].solution_value()>0]
# 得到参观日期安排
df = pd.DataFrame(temp,columns=['family_id','day','result'])
return df
result_solveLP = solveLP() # 线性规划求解
print(result_solveLP)
# 设置阈值
THRS = 0.999
assigned_df = result[result.result > THRS]
# 没有安排好的,即不为0和1
unassigned_df = result[(result.result < THRS) & (result.result > 1-THRS)]
# 计算每天访问的人数(根据assigned_df)
assigned_df['family_size'] = FAMILY_SIZE[assigned_df.family_id]
# 按照day进行聚合
occupancy = assigned_df.groupby('day').family_size.sum().values
min_occupancy = np.array([max(0,MIN_OCCUPANCY - x) for x in occupancy])
max_occupancy = np.array([MAX_OCCUPANCY - x for x in occupancy])
#进行 使用整数规划求解
def solveIP(families,min_occupancy,max_occupancy):
# 创建求解器
solver = pywraplp.Solver('AssignmentProblem', pywraplp.Solver.CBC_MIXED_INTEGER_PROGRAMMING)
# 需要安排的家庭
n_families = len(families)
x = {} # family_id在第j天是否参观
candidates = [[] for x in range(N_DAYS)] # 定义了len为100的list
for i in families:#family_id
for j in DESIRED[i,:]:# family_id的choice
#print(j)
candidates[j].append(i) # 在第j天,有第i个family参观
x[i,j] = solver.BoolVar('x[%i,%i]' %(i,j)) # x[%i,%i]中的i代表integer类型
daily_occupancy = [solver.Sum([x[i,j] * FAMILY_SIZE[i] for i in candidates[j]])\
for j in range(N_DAYS)] # j代表1-100天
family_presence = [solver.Sum(x[i,j] for j in DESIRED[i,:])\
for i in families]
preference_cost = solver.Sum([pcost_mat[i,j] * x[i,j] for i in families\
for j in DESIRED[i,:]])
# 满足preference_cost最小
solver.Minimize(preference_cost)
# 每个家庭都在10个choice中出现一次
for i in range(n_families):
solver.Add(family_presence[i]==1)
# 每天访问人数约束
for j in range(N_DAYS):
solver.Add(daily_occupancy[j]>=min_occupancy[j])
solver.Add(daily_occupancy[j]<=max_occupancy[j])
result = solver.Solve()
temp = [(i,j) for i in families\
for j in DESIRED[i,:] if x[i,j].solution_value()>0]
# 计算剩余家庭的安排
df = pd.DataFrame(temp,columns=['family_id','day'])
return df
# 没有安排的family_id
unassigned = unassigned_df.family_id.unique()
result_solveIP = solveIP(unassigned,min_occupancy,max_occupancy) # 整数规划求解
print(result_solveIP)
# 合并数据
df_concat = pd.concat((assigned_df[['family_id','day']],result)).sort_values('family_id')
@njit(fastmath=True)
def pcost(prediction):
daily_occupancy = np.zeros(N_DAYS+1,dtype=np.int64)
penalty = 0
for (i,p) in enumerate(prediction):
n = FAMILY_SIZE[i]
penalty += pcost_mat[i,p]
daily_occupancy[p] += n
return penalty,daily_occupancy
@njit(fastmath=True)
def acost(daily_occupancy):
accounting_cost = 0
num_out_of_range = 0
for day in range(N_DAYS):
n_p1 = daily_occupancy[day+1] # 前一天
n = daily_occupancy[day] # 当天
num_out_of_range += (n>MAX_OCCUPANCY) or (n<MIN_OCCUPANCY)
accounting_cost += acost_mat[n,n_p1]
return accounting_cost,num_out_of_range
@njit(fastmath=True)
def cost_function(prediction):
penalty,daily_occupancy = pcost(prediction) #统计preference cost和每天承载数量
accounting_cost,num_out_of_range = acost(daily_occupancy) # 根据每天承载数量计算accounting cost
final_score = penalty + accounting_cost + num_out_of_range * 9999999
return final_score
prediction = df_concat.day.values
print(cost_function(prediction)) # 109138.55183535561
# 寻找更好的替代方案
def find_better(pred):
fobs = np.argsort(FAMILY_SIZE) # 返回数组从小到大的索引
#print(np.sort(FAMILY_SIZE)) # 对FAMILY_SIZE按从小到大的顺序排序
score = cost_function(pred)
original_score = np.inf #打擂法 正无穷初始值
# 如果找不到更新则退出
while score < original_score:
original_score = score
for family_id in fobs:
for pick in range(10):
# 得到family_id在choice pick的日期day
day = DESIRED[family_id, pick]
# 该family的原有日期oldvalue
oldvalue = pred[family_id]
# 将原有oldvalue替换为现在的choice pick
pred[family_id] = day
# 重新计算分数
new_score = cost_function(pred)
# 如果比原来分数小,更新choice成功
if new_score < score:
score = new_score
else:# 设置为原来的oldvalue
pred[family_id] = oldvalue
print(score,end='\r')
print(score)
print(find_better(prediction)) # -3.041118109153846e+216