High-Throughput Computational Design: Batch Generation and Submission of VASP Jobs with Python

Abstract

In computational materials science, VASP (Vienna Ab initio Simulation Package) is one of the most widely used first-principles codes. However, manually preparing input files, submitting jobs, and monitoring calculations is not only time-consuming but also error-prone. This article describes in detail how to automate the entire VASP workflow with Python scripts, covering structure generation, input-file preparation, batch job submission, calculation status monitoring, and automatic result analysis. By moving from "manual operation" to "batch production", researchers can focus on the scientific questions themselves and greatly improve their productivity.

1. Introduction: The Value and Challenges of High-Throughput Computing

1.1 Why High-Throughput Computing?

Materials modeling studies typically involve large numbers of similar calculations:

  • Materials screening: testing many chemical compositions or crystal structures
  • Parameter optimization: studying how different computational parameters affect the results
  • Phase-diagram construction: computing properties at a large number of state points
  • Property sweeps: studying external-field effects such as strain or electric fields

Handling these tasks by hand is practically infeasible, so high-throughput automation becomes the natural choice.

1.2 Limitations of the Traditional Approach

The traditional manual workflow has clear problems:

  • Time cost: preparing the input files for a single job can take 10-30 minutes
  • Human error: manual operations are error-prone, and mistakes are hard to trace
  • Low efficiency: computing resources cannot be fully utilized
  • Inconsistency: parameter settings may differ from one job to the next

1.3 Advantages of Python Automation

Automating the workflow with Python brings several benefits:

  • Consistency: every calculation uses the same parameter standards
  • Reproducibility: the computational settings and workflow are fully recorded
  • Efficiency: hundreds of jobs can be managed simultaneously
  • Flexibility: workflows can be customized as needed

2. Environment Setup and Tooling

2.1 Required Python Libraries

# Install the required libraries
pip install numpy matplotlib ase pymatgen paramiko scipy pandas

2.2 Computing Environment Configuration

# config.py - configuration file
import os
from pathlib import Path

# Base path settings
BASE_DIR = Path("/path/to/your/workdir")
VASP_EXEC = "/path/to/vasp_std"  # path to the VASP executable
POTENTIAL_DIR = "/path/to/potentials"  # path to the pseudopotential files

# Remote server configuration (if running on a remote cluster)
SSH_CONFIG = {
    'hostname': 'hpc.server.edu',
    'username': 'your_username',
    'password': 'your_password',  # key-based authentication is recommended
    'port': 22
}

# Resource settings
RESOURCE_CONFIG = {
    'nodes': 1,
    'cores_per_node': 16,
    'memory': '64G',
    'walltime': '72:00:00'
}

# Default calculation parameters
DEFAULT_PARAMS = {
    'encut': 520,
    'kpoints': [3, 3, 3],
    'ismear': 1,
    'sigma': 0.2,
    'isif': 3,
    'ibrion': 2,
    'nsw': 100,
    'ediffg': -0.01
}
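
For security, the plaintext password in SSH_CONFIG is better replaced by key-based authentication, which paramiko's connect() supports directly. A minimal alternative within config.py (the key path is an assumption; adjust it to your setup):

# Alternative SSH configuration using key-based authentication (recommended)
SSH_CONFIG = {
    'hostname': 'hpc.server.edu',
    'username': 'your_username',
    'key_filename': str(Path.home() / '.ssh' / 'id_rsa'),  # assumed key location
    'port': 22
}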

3. Core Functionality

3.1 Structure Generation and Handling

# structure_manager.py
from pathlib import Path
from ase import Atoms
from ase.build import bulk, make_supercell
from pymatgen.core import Structure, Lattice
import numpy as np
import json

class StructureManager:
    def __init__(self, base_dir):
        self.base_dir = Path(base_dir)
        self.structures_dir = self.base_dir / "structures"
        self.structures_dir.mkdir(parents=True, exist_ok=True)
    
    def create_bulk_structure(self, element, crystal_type, lattice_params,
                              name=None):
        """Create a bulk crystal structure via ASE's bulk() builder."""
        if crystal_type in ('sc', 'fcc', 'bcc', 'diamond', 'zincblende'):
            structure = bulk(element, crystal_type, a=lattice_params['a'])
        elif crystal_type == 'hcp':
            structure = bulk(element, crystal_type, a=lattice_params['a'],
                             c=lattice_params['c'])
        else:
            raise ValueError(f"Unsupported crystal type: {crystal_type}")
        
        if name is None:
            name = f"{element}_{crystal_type}"
        structure.info['name'] = name  # used later when naming strained structures
        
        # Save the structure
        self.save_structure(structure, name)
        return structure
    
    def create_alloy_structure(self, base_element, dopants, concentrations, 
                             crystal_type, lattice_params):
        """Create an alloy structure (substitutional doping)."""
        # Alloy-generation logic goes here
        pass
    
    def apply_strain(self, structure, strain_tensor, name_suffix="_strained"):
        """应用应变到结构"""
        strained_structure = structure.copy()
        strained_structure.set_cell(
            np.dot(strained_structure.cell, strain_tensor), 
            scale_atoms=True
        )
        
        name = structure.info.get('name', 'unknown') + name_suffix
        self.save_structure(strained_structure, name)
        return strained_structure
    
    def save_structure(self, structure, name):
        """保存结构到文件"""
        # 保存为POSCAR格式
        poscar_path = self.structures_dir / f"POSCAR_{name}"
        structure.write(poscar_path, format='vasp')
        
        # Save structure metadata
        info_path = self.structures_dir / f"info_{name}.json"
        info = {
            'name': name,
            'formula': structure.get_chemical_formula(),
            'volume': structure.get_volume(),
            'lattice_parameters': structure.cell.lengths().tolist(),
            'angles': structure.cell.angles().tolist(),
            'natoms': len(structure)
        }
        
        with open(info_path, 'w') as f:
            json.dump(info, f, indent=2)
    
    def generate_strain_series(self, base_structure, strain_range, n_points=10):
        """生成应变序列"""
        strains = np.linspace(-strain_range, strain_range, n_points)
        strained_structures = []
        
        for i, strain in enumerate(strains):
            strain_tensor = np.array([
                [1 + strain, 0, 0],
                [0, 1 + strain, 0],
                [0, 0, 1 + strain]
            ])
            
            strained_struct = self.apply_strain(
                base_structure, strain_tensor, f"_strain_{i:02d}"
            )
            strained_structures.append(strained_struct)
        
        return strained_structures
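
With the class above in place, a short usage sketch might look like the following (the work directory and lattice constant are illustrative):

# Example usage of StructureManager (illustrative values)
from structure_manager import StructureManager

manager = StructureManager("/path/to/your/workdir")

# Build diamond-Si, then generate an isotropic strain series of 11 points within ±2%
si = manager.create_bulk_structure('Si', 'diamond', {'a': 5.43})
strained = manager.generate_strain_series(si, strain_range=0.02, n_points=11)
print(f"Generated {len(strained)} strained structures")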

3.2 Input File Generation

# input_generator.py
from pathlib import Path
import shutil

class VASPInputGenerator:
    def __init__(self, potential_dir):
        self.potential_dir = Path(potential_dir)
        self.templates = self._load_templates()
    
    def _load_templates(self):
        """加载模板文件"""
        templates = {}
        template_files = ['INCAR', 'KPOINTS', 'POTCAR']
        
        for tfile in template_files:
            template_path = self.potential_dir / f"template_{tfile}"
            if template_path.exists():
                with open(template_path, 'r') as f:
                    templates[tfile] = f.read()
        
        return templates
    
    def generate_incar(self, params, calculation_type='relax'):
        """Generate the INCAR contents from the template."""
        base_incar = self.templates.get('INCAR', '')
        
        # Adjust parameters according to the calculation type
        if calculation_type == 'relax':
            params.update({'IBRION': 2, 'NSW': 100, 'ISIF': 3})
        elif calculation_type == 'static':
            params.update({'IBRION': -1, 'NSW': 0, 'ISIF': 2})
        elif calculation_type == 'band':
            params.update({'IBRION': -1, 'NSW': 0, 'ICHARG': 11})
        
        # Fill the {{KEY}} placeholders (keys are normalized to upper case)
        incar_content = base_incar
        for key, value in params.items():
            incar_content = incar_content.replace(
                f'{{{{{str(key).upper()}}}}}', str(value)
            )
        
        return incar_content
    
    def generate_kpoints(self, kpoints, mode='automatic'):
        """Generate the KPOINTS contents."""
        if mode == 'automatic':
            kpoints_content = f"""Automatic mesh
0
Monkhorst-Pack
{kpoints[0]} {kpoints[1]} {kpoints[2]}
0 0 0
"""
        else:
            raise ValueError(f"Unsupported KPOINTS mode: {mode}")
        return kpoints_content
    
    def generate_potcar(self, elements):
        """生成POTCAR文件"""
        potcar_content = b''
        for element in elements:
            pot_file = self.potential_dir / f"POTCAR_{element}"
            if pot_file.exists():
                with open(pot_file, 'rb') as f:
                    potcar_content += f.read()
            else:
                raise FileNotFoundError(f"Potential file not found for {element}")
        
        return potcar_content
    
    def prepare_calculation_directory(self, structure, calc_dir, 
                                   calculation_type='relax', **params):
        """准备计算目录"""
        calc_dir = Path(calc_dir)
        calc_dir.mkdir(parents=True, exist_ok=True)
        
        # Write POSCAR
        structure.write(calc_dir / "POSCAR", format='vasp')
        
        # Generate INCAR
        incar_content = self.generate_incar(params, calculation_type)
        with open(calc_dir / "INCAR", 'w') as f:
            f.write(incar_content)
        
        # Generate KPOINTS
        kpoints_content = self.generate_kpoints(params.get('kpoints', [3,3,3]))
        with open(calc_dir / "KPOINTS", 'w') as f:
            f.write(kpoints_content)
        
        # Generate POTCAR (the element order must match POSCAR, so keep
        # the order of first appearance instead of using an unordered set)
        elements = list(dict.fromkeys(structure.get_chemical_symbols()))
        potcar_content = self.generate_potcar(elements)
        with open(calc_dir / "POTCAR", 'wb') as f:
            f.write(potcar_content)
        
        # Generate the submission script
        self.generate_submission_script(calc_dir, calculation_type)
        
        return calc_dir
    
    def generate_submission_script(self, calc_dir, calculation_type):
        """生成作业提交脚本"""
        script_content = f"""#!/bin/bash
#SBATCH --job-name={calculation_type}
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=16
#SBATCH --time=72:00:00
#SBATCH --mem=64G

module load intel impi vasp

mpirun -np 16 /path/to/vasp_std > vasp.out
"""
        with open(calc_dir / "submit.sh", 'w') as f:
            f.write(script_content)
        
        # Make the script executable
        (calc_dir / "submit.sh").chmod(0o755)

3.3 Batch Job Submission and Management

# task_manager.py
import paramiko
from pathlib import Path
import time
import json
from typing import List, Dict
import subprocess

class VASPTaskManager:
    def __init__(self, ssh_config=None):
        self.ssh_config = ssh_config
        self.ssh = None
        self.sftp = None
        self.tasks_db = {}
        
        if ssh_config:
            self._setup_ssh_connection()
    
    def _setup_ssh_connection(self):
        """建立SSH连接"""
        try:
            self.ssh = paramiko.SSHClient()
            self.ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
            self.ssh.connect(**self.ssh_config)
            self.sftp = self.ssh.open_sftp()
        except Exception as e:
            print(f"SSH连接失败: {e}")
            raise
    
    def submit_local_task(self, calc_dir):
        """提交本地计算任务"""
        try:
            result = subprocess.run(
                ['sbatch', 'submit.sh'],
                cwd=calc_dir,
                capture_output=True,
                text=True
            )
            
            if result.returncode == 0:
                job_id = result.stdout.split()[-1]
                self._register_task(calc_dir, job_id, 'local')
                return job_id
            else:
                raise Exception(f"提交失败: {result.stderr}")
                
        except Exception as e:
            print(f"任务提交错误: {e}")
            return None
    
    def submit_remote_task(self, local_dir, remote_dir):
        """提交远程计算任务"""
        try:
            # Upload the calculation directory
            self._upload_directory(local_dir, remote_dir)
            
            # Submit the job
            stdin, stdout, stderr = self.ssh.exec_command(
                f"cd {remote_dir} && sbatch submit.sh"
            )
            
            output = stdout.read().decode()
            error = stderr.read().decode()
            
            if output:
                job_id = output.split()[-1]
                self._register_task(local_dir, job_id, 'remote', remote_dir)
                return job_id
            else:
                raise Exception(f"远程提交失败: {error}")
                
        except Exception as e:
            print(f"远程任务提交错误: {e}")
            return None
    
    def _upload_directory(self, local_dir, remote_dir):
        """上传整个目录到远程服务器"""
        # 创建远程目录
        self.ssh.exec_command(f"mkdir -p {remote_dir}")
        
        # Upload every file in the directory
        local_path = Path(local_dir)
        for file_path in local_path.iterdir():
            if file_path.is_file():
                remote_path = f"{remote_dir}/{file_path.name}"
                self.sftp.put(str(file_path), remote_path)
    
    def _register_task(self, calc_dir, job_id, task_type, remote_dir=None):
        """注册任务到数据库"""
        task_info = {
            'job_id': job_id,
            'calc_dir': str(calc_dir),
            'type': task_type,
            'remote_dir': remote_dir,
            'status': 'submitted',
            'submit_time': time.time(),
            'completion_time': None
        }
        
        self.tasks_db[job_id] = task_info
        
        # Persist the database to disk
        self._save_task_db()
    
    def monitor_tasks(self, interval=300):
        """监控任务状态"""
        while True:
            completed_tasks = 0
            running_tasks = 0
            failed_tasks = 0
            
            for job_id, task_info in self.tasks_db.items():
                if task_info['status'] in ['completed', 'failed']:
                    completed_tasks += 1
                    continue
                
                status = self._check_task_status(job_id, task_info['type'])
                
                if status == 'completed':
                    task_info['status'] = 'completed'
                    task_info['completion_time'] = time.time()
                    completed_tasks += 1
                    self._process_completed_task(task_info)
                elif status == 'failed':
                    task_info['status'] = 'failed'
                    failed_tasks += 1
                else:
                    running_tasks += 1
            
            print(f"\n任务状态: 运行中 {running_tasks}, 完成 {completed_tasks}, 失败 {failed_tasks}")
            
            if running_tasks == 0:
                break
                
            time.sleep(interval)
    
    def _check_task_status(self, job_id, task_type):
        """检查任务状态"""
        if task_type == 'local':
            return self._check_local_task_status(job_id)
        else:
            return self._check_remote_task_status(job_id)
    
    def _check_local_task_status(self, job_id):
        """检查本地任务状态"""
        try:
            result = subprocess.run(
                ['squeue', '-j', job_id],
                capture_output=True,
                text=True
            )
            
            if job_id not in result.stdout:
                # The job is no longer queued; inspect the output to decide whether it finished
                calc_dir = self.tasks_db[job_id]['calc_dir']
                if self._is_calculation_complete(calc_dir):
                    return 'completed'
                else:
                    return 'failed'
            return 'running'
            
        except Exception as e:
            print(f"检查任务状态错误: {e}")
            return 'unknown'
    
    def _is_calculation_complete(self, calc_dir):
        """检查计算是否完成"""
        out_file = Path(calc_dir) / "vasp.out"
        if out_file.exists():
            with open(out_file, 'r') as f:
                content = f.read()
                return 'General timing and accounting' in content
        return False

    def _process_completed_task(self, task_info):
        """处理完成的任务"""
        calc_dir = task_info['calc_dir']
        print(f"任务 {task_info['job_id']} 完成,开始处理结果...")
        
        # Download results from the remote server (for remote jobs)
        if task_info['type'] == 'remote' and task_info['remote_dir']:
            self._download_results(task_info['remote_dir'], calc_dir)
        
        # Analyze the results
        self.analyze_results(calc_dir)
    
    def analyze_results(self, calc_dir):
        """Analyze the results of a completed calculation."""
        # Result-analysis logic goes here (see VASPResultAnalyzer in Section 3.4)
        pass
    
    def _check_remote_task_status(self, job_id):
        """Check the status of a remote job via squeue over SSH."""
        stdin, stdout, stderr = self.ssh.exec_command(f"squeue -j {job_id}")
        if job_id not in stdout.read().decode():
            # The job left the queue; look for VASP's timing footer in the remote output
            remote_dir = self.tasks_db[job_id]['remote_dir']
            stdin, stdout, stderr = self.ssh.exec_command(
                f"grep -l 'General timing and accounting' {remote_dir}/vasp.out"
            )
            return 'completed' if stdout.read().strip() else 'failed'
        return 'running'
    
    def _download_results(self, remote_dir, local_dir):
        """Download all files from a remote calculation directory."""
        local_path = Path(local_dir)
        local_path.mkdir(parents=True, exist_ok=True)
        for name in self.sftp.listdir(remote_dir):
            self.sftp.get(f"{remote_dir}/{name}", str(local_path / name))
    
    def _save_task_db(self):
        """Persist the task database to disk."""
        with open('task_database.json', 'w') as f:
            json.dump(self.tasks_db, f, indent=2)
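
Because the task database is persisted to task_database.json, a restarted script can resume monitoring where it left off. A small loading helper (not part of the class above; shown as one possible way to resume) could look like this:

# resume_tasks.py - reload a previously saved task database into a manager
import json
from pathlib import Path

from task_manager import VASPTaskManager

def load_task_db(manager: VASPTaskManager, db_file="task_database.json"):
    """Populate a VASPTaskManager with a previously saved task database."""
    db_path = Path(db_file)
    if db_path.exists():
        with open(db_path, 'r') as f:
            manager.tasks_db = json.load(f)
    return manager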

3.4 Result Analysis and Extraction

# result_analyzer.py
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
from ase.io import read
import re

class VASPResultAnalyzer:
    def __init__(self, base_dir):
        self.base_dir = Path(base_dir)
        self.results = {}
    
    def parse_vasprun(self, calc_dir):
        """解析vasprun.xml文件"""
        vasprun_path = calc_dir / "vasprun.xml"
        if vasprun_path.exists():
            try:
                atoms = read(vasprun_path, index=-1)  # read the final ionic step
                return {
                    'final_energy': atoms.get_potential_energy(),
                    'final_structure': atoms,
                    'forces': atoms.get_forces(),
                    'stresses': atoms.get_stress(),
                    'volume': atoms.get_volume(),
                    'converged': self._check_convergence(calc_dir)
                }
            except Exception as e:
                print(f"解析vasprun.xml错误: {e}")
                return None
        return None
    
    def parse_outcar(self, calc_dir):
        """解析OUTCAR文件"""
        outcar_path = calc_dir / "OUTCAR"
        results = {}
        
        if outcar_path.exists():
            with open(outcar_path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()
            
            # Extract the elastic constants
            elastic_pattern = r'TOTAL ELASTIC MODULI \(kBar\)\s*([\s\S]*?)---'
            elastic_match = re.search(elastic_pattern, content)
            if elastic_match:
                results['elastic_constants'] = self._parse_elastic_constants(
                    elastic_match.group(1)
                )
            
            # Extract band-structure information
            band_pattern = r'k-point\s+\d+ :\s+([\d\.\-Ee+]+)\s+([\d\.\-Ee+]+)'
            band_matches = re.findall(band_pattern, content)
            if band_matches:
                results['band_structure'] = self._parse_band_structure(band_matches)
        
        return results
    
    def _check_convergence(self, calc_dir):
        """检查计算是否收敛"""
        outcar_path = calc_dir / "OUTCAR"
        if outcar_path.exists():
            with open(outcar_path, 'r') as f:
                for line in f:
                    if 'reached required accuracy' in line:
                        return True
        return False
    
    def analyze_energy_volume(self, task_dirs, label_func=None):
        """分析能量-体积关系"""
        energies = []
        volumes = []
        labels = []
        
        for task_dir in task_dirs:
            result = self.parse_vasprun(task_dir)
            if result and result['converged']:
                energies.append(result['final_energy'])
                volumes.append(result['volume'])
                
                if label_func:
                    labels.append(label_func(task_dir))
                else:
                    labels.append(task_dir.name)
        
        # Fit an equation of state
        if len(energies) > 3:
            eos_fit = self._fit_eos(volumes, energies)
        else:
            eos_fit = None
        
        return {
            'energies': energies,
            'volumes': volumes,
            'labels': labels,
            'eos_fit': eos_fit
        }
    
    def _fit_eos(self, volumes, energies):
        """Fit an equation of state to the E(V) data."""
        try:
            from ase.eos import EquationOfState
            eos = EquationOfState(volumes, energies)
            # fit() returns the equilibrium volume, energy and bulk modulus (eV/Å³)
            v0, e0, B = eos.fit()
            return {
                'v0': v0,
                'e0': e0,
                'bulk_modulus': B
            }
        except Exception as e:
            print(f"EOS fit error: {e}")
            return None
    
    def generate_report(self, task_dirs, output_file="report.pdf"):
        """生成分析报告"""
        fig, axes = plt.subplots(2, 2, figsize=(15, 12))
        
        # Energy-volume curve
        ev_data = self.analyze_energy_volume(task_dirs)
        axes[0, 0].plot(ev_data['volumes'], ev_data['energies'], 'o-')
        axes[0, 0].set_xlabel('Volume (Å³)')
        axes[0, 0].set_ylabel('Energy (eV)')
        axes[0, 0].set_title('Energy-Volume Curve')
        
        # Additional analysis panels can be added here...
        
        plt.tight_layout()
        plt.savefig(output_file, dpi=300, bbox_inches='tight')
        plt.close()
        
        return output_file
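
ASE's EquationOfState returns the bulk modulus in eV/Å³; converting it to GPa follows the conversion used in the ASE documentation. A short sketch using the analyzer above (the work directory is a placeholder, and task_dirs is the list of calculation directories from the workflows in Section 4):

# Convert the fitted bulk modulus from eV/Å³ to GPa
from ase.units import kJ
from result_analyzer import VASPResultAnalyzer

analyzer = VASPResultAnalyzer("/path/to/your/workdir")  # placeholder work directory
ev_data = analyzer.analyze_energy_volume(task_dirs)      # task_dirs as in Section 4
if ev_data['eos_fit']:
    B_gpa = ev_data['eos_fit']['bulk_modulus'] / kJ * 1.0e24
    print(f"Equilibrium volume: {ev_data['eos_fit']['v0']:.2f} Å³, "
          f"bulk modulus: {B_gpa:.1f} GPa")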

4. Complete Workflow Examples

4.1 Batch Structure Relaxation

# bulk_relaxation.py
from structure_manager import StructureManager
from input_generator import VASPInputGenerator
from task_manager import VASPTaskManager
from result_analyzer import VASPResultAnalyzer
from config import BASE_DIR, POTENTIAL_DIR, DEFAULT_PARAMS

def run_bulk_relaxation_workflow():
    """运行体相结构优化工作流"""
    
    # Initialize the managers
    struct_manager = StructureManager(BASE_DIR)
    input_gen = VASPInputGenerator(POTENTIAL_DIR)
    task_manager = VASPTaskManager()  # run locally
    
    # Create structures for different materials
    elements = ['Si', 'Ge', 'C', 'SiC']
    structures = []
    
    # Experimental lattice constants (Å) of the diamond-structure elements
    lattice_constants = {'Si': 5.43, 'Ge': 5.66, 'C': 3.57}
    for element in elements[:3]:  # elemental structures
        structure = struct_manager.create_bulk_structure(
            element, 'diamond', {'a': lattice_constants[element]}
        )
        structures.append(structure)
    
    # SiC compound (zincblende)
    si_c_structure = struct_manager.create_bulk_structure(
        'SiC', 'zincblende', {'a': 4.36}
    )
    structures.append(si_c_structure)
    
    # Prepare the calculation directories
    task_dirs = []
    for i, structure in enumerate(structures):
        calc_dir = BASE_DIR / f"calc_{i:02d}"
        input_gen.prepare_calculation_directory(
            structure, calc_dir, 'relax', **DEFAULT_PARAMS
        )
        task_dirs.append(calc_dir)
    
    # Submit the jobs
    job_ids = []
    for calc_dir in task_dirs:
        job_id = task_manager.submit_local_task(calc_dir)
        if job_id:
            job_ids.append(job_id)
            print(f"提交任务: {calc_dir.name}, Job ID: {job_id}")
    
    # Monitor the jobs
    print("Monitoring jobs...")
    task_manager.monitor_tasks(interval=600)  # check every 10 minutes
    
    # Analyze the results
    analyzer = VASPResultAnalyzer(BASE_DIR)
    report_file = analyzer.generate_report(task_dirs)
    print(f"Analysis report written to: {report_file}")

if __name__ == "__main__":
    run_bulk_relaxation_workflow()

4.2 Strain Sweep Calculations

# strain_sweep.py
import numpy as np
import matplotlib.pyplot as plt

from structure_manager import StructureManager
from input_generator import VASPInputGenerator
from task_manager import VASPTaskManager
from result_analyzer import VASPResultAnalyzer
from config import BASE_DIR, POTENTIAL_DIR, DEFAULT_PARAMS

def run_strain_sweep_workflow():
    """Run the strain sweep workflow."""
    
    # Create the base structure
    struct_manager = StructureManager(BASE_DIR)
    base_structure = struct_manager.create_bulk_structure(
        'Si', 'diamond', {'a': 5.43}, name='Si_base'
    )
    
    # Generate the strain series
    strained_structures = struct_manager.generate_strain_series(
        base_structure, strain_range=0.1, n_points=21
    )
    
    # Prepare the calculation directories
    input_gen = VASPInputGenerator(POTENTIAL_DIR)
    task_manager = VASPTaskManager()
    
    task_dirs = []
    for i, structure in enumerate(strained_structures):
        calc_dir = BASE_DIR / f"strain_calc_{i:02d}"
        
        # 使用静态计算(结构已 strained,只计算能量)
        input_gen.prepare_calculation_directory(
            structure, calc_dir, 'static', **DEFAULT_PARAMS
        )
        task_dirs.append(calc_dir)
    
    # Submit and monitor the jobs
    job_ids = []
    for calc_dir in task_dirs:
        job_id = task_manager.submit_local_task(calc_dir)
        job_ids.append(job_id)
    
    task_manager.monitor_tasks()
    
    # Analyze the strain-energy relationship
    analyzer = VASPResultAnalyzer(BASE_DIR)
    results = analyzer.analyze_energy_volume(
        task_dirs, 
        label_func=lambda x: float(x.name.split('_')[-1])
    )
    
    # Plot the strain-energy curve.
    # The labels are the directory indices; map them back to the strains used in
    # generate_strain_series: np.linspace(-0.1, 0.1, 21) -> strain = -0.1 + 0.01 * index
    plt.figure(figsize=(10, 6))
    strains = [-0.1 + 0.01 * idx for idx in results['labels']]
    plt.plot(strains, results['energies'], 'o-')
    plt.xlabel('Strain')
    plt.ylabel('Energy (eV)')
    plt.title('Strain-Energy Relationship')
    plt.grid(True)
    plt.savefig('strain_energy_curve.png', dpi=300, bbox_inches='tight')

5. Advanced Features and Best Practices

5.1 Error Handling and Retry Mechanisms

# error_handler.py
import time
from pathlib import Path
from typing import Callable

def retry_with_backoff(func: Callable, max_retries: int = 3, 
                      initial_delay: float = 1.0, backoff_factor: float = 2.0):
    """带指数退避的重试装饰器"""
    def wrapper(*args, **kwargs):
        retries = 0
        delay = initial_delay
        
        while retries < max_retries:
            try:
                return func(*args, **kwargs)
            except Exception as e:
                retries += 1
                if retries == max_retries:
                    raise e
                
                print(f"尝试 {retries}/{max_retries} 失败: {e}")
                print(f"等待 {delay} 秒后重试...")
                time.sleep(delay)
                delay *= backoff_factor
        
        raise Exception("所有重试尝试均失败")
    
    return wrapper

class ErrorHandler:
    def __init__(self, task_manager):
        self.task_manager = task_manager
    
    def handle_failed_tasks(self):
        """处理失败的任务"""
        failed_tasks = [
            job_id for job_id, info in self.task_manager.tasks_db.items()
            if info['status'] == 'failed'
        ]
        
        for job_id in failed_tasks:
            task_info = self.task_manager.tasks_db[job_id]
            print(f"处理失败任务: {job_id}")
            
            # Diagnose the cause of failure
            error_type = self._analyze_failure(task_info['calc_dir'])
            
            # Take different actions depending on the error type
            if error_type == 'convergence':
                self._handle_convergence_failure(task_info)
            elif error_type == 'memory':
                self._handle_memory_failure(task_info)
            else:
                self._handle_unknown_failure(task_info)
    
    def _analyze_failure(self, calc_dir):
        """Diagnose the cause of a failure from the VASP output."""
        out_file = Path(calc_dir) / "vasp.out"
        if out_file.exists():
            with open(out_file, 'r') as f:
                content = f.read()
            
            if 'ZBRENT: fatal error' in content:
                return 'convergence'
            elif 'out of memory' in content:
                return 'memory'
            elif 'ERROR' in content:
                return 'unknown_error'
        
        return 'unknown'
    
    def _handle_convergence_failure(self, task_info):
        """Recovery strategy for convergence failures (e.g. ZBRENT errors)."""
        # Typical recovery: copy CONTCAR to POSCAR and resubmit with adjusted settings
        pass
    
    def _handle_memory_failure(self, task_info):
        """Recovery strategy for out-of-memory failures."""
        # Typical recovery: resubmit with more memory or more nodes
        pass
    
    def _handle_unknown_failure(self, task_info):
        """Fallback for failures that cannot be classified automatically."""
        pass
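
A short usage sketch for the retry helper and the error handler (the retry settings and the calculation path are illustrative):

# Wrap a flaky submission call in the exponential-backoff retry helper
from task_manager import VASPTaskManager
from error_handler import retry_with_backoff, ErrorHandler

task_manager = VASPTaskManager()
submit_with_retry = retry_with_backoff(task_manager.submit_local_task,
                                       max_retries=5, initial_delay=2.0)
job_id = submit_with_retry("/path/to/calc_00")  # placeholder calculation directory

# After monitoring finishes, triage anything that failed
ErrorHandler(task_manager).handle_failed_tasks()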

5.2 Resource Optimization Strategies

# resource_optimizer.py
import time

class ResourceOptimizer:
    def __init__(self, task_manager):
        self.task_manager = task_manager
        self.performance_data = []
    
    def optimize_resources(self):
        """优化资源分配"""
        # 分析历史性能数据
        if self.performance_data:
            optimal_config = self._find_optimal_config()
            self._apply_configuration(optimal_config)
    
    def _find_optimal_config(self):
        """寻找最优资源配置"""
        # 基于历史数据寻找最佳配置
        return {
            'nodes': 1,
            'cores_per_node': 16,
            'memory': '64G'
        }
    
    def monitor_performance(self):
        """监控性能并收集数据"""
        while True:
            for job_id, task_info in self.task_manager.tasks_db.items():
                if task_info['status'] == 'running':
                    performance = self._measure_performance(task_info)
                    self.performance_data.append(performance)
            
            time.sleep(300)  # sample every 5 minutes

6. Summary and Outlook

6.1 Value Delivered

The automation approach described in this article delivers the following value:

  1. Efficiency gains: from handling a few jobs per day to managing hundreds of jobs
  2. Fewer errors: the automated workflow eliminates manual mistakes
  3. Guaranteed consistency: all calculations use the same parameter standards
  4. Reproducibility: the computational workflow and parameters are fully recorded

6.2 Best-Practice Recommendations

  1. Automate incrementally: start with simple workflows and add complexity step by step
  2. Test thoroughly: validate on a small scale before launching large campaigns
  3. Monitor resources: keep a close eye on compute-resource usage
  4. Back up regularly: back up the task database and important results on a schedule

6.3 Future Extensions

  1. Machine-learning integration: use ML to predict optimal calculation parameters
  2. Cloud platform support: extend the workflow to cloud computing platforms
  3. Real-time monitoring: provide a web interface for live monitoring
  4. Smart scheduling: schedule jobs intelligently based on priority and resource availability

By fully automating the VASP workflow, researchers can focus on the scientific questions themselves and substantially increase both research efficiency and output.

