# 本文介绍如何通过脚本将本地文件直接导入到MongoDB:传入文件路径、数据库名和对应的集合名即可。
import csv
import json
import os
import sys
import yaml
import xml.etree.ElementTree as ET
from pymongo import MongoClient
from bson import json_util
from openpyxl import load_workbook
def _read_delimited(file_path, delimiter=','):
    """Read a delimited text file into a list of dicts keyed by its header row."""
    # newline='' lets the csv module handle line endings inside quoted fields;
    # utf-8-sig transparently drops a leading BOM so the first header is clean.
    with open(file_path, 'r', encoding='utf-8-sig', newline='') as file:
        return list(csv.DictReader(file, delimiter=delimiter))


def _read_json(file_path):
    """Parse a JSON document, falling back to JSON Lines.

    Returns:
        A ``(payload, is_json_lines)`` tuple. ``payload`` is whatever the
        document decodes to, or a list with one entry per non-blank line when
        the file is not a single valid JSON document.
    """
    # Read the text once: a failed json.load() would leave the stream at EOF,
    # which made the line-by-line fallback see an empty file.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        text = file.read()
    try:
        return json.loads(text), False
    except json.JSONDecodeError:
        lines = [json.loads(line) for line in text.splitlines() if line.strip()]
        return lines, True


def _read_excel(file_path):
    """Read the active worksheet into a list of dicts keyed by the first row."""
    # NOTE: openpyxl only reads the OOXML formats; a legacy .xls file makes
    # load_workbook raise, which the caller reports as an import failure.
    wb = load_workbook(file_path, read_only=True)
    try:
        rows = wb.active.iter_rows(values_only=True)
        # values_only=True yields plain cell values, not Cell objects.
        headers = next(rows, None)
        if headers is None:
            return []
        # zip() tolerates rows wider or narrower than the header row; columns
        # without a header are skipped and keys are coerced to str because
        # BSON document keys must be strings.
        return [
            {str(header): value for header, value in zip(headers, row) if header is not None}
            for row in rows
        ]
    finally:
        wb.close()


def _read_xml(file_path):
    """Turn each child of the XML root into one record dict.

    Each grandchild becomes a field holding its text; a grandchild that has
    children of its own becomes a nested dict of ``tag -> text`` (one level).
    """
    root = ET.parse(file_path).getroot()
    records = []
    for record in root:
        document = {}
        for element in record:
            if len(element):
                document[element.tag] = {child.tag: child.text for child in element}
            else:
                document[element.tag] = element.text
        records.append(document)
    return records


def _read_yaml(file_path):
    """Load a YAML document with the safe loader."""
    with open(file_path, 'r', encoding='utf-8') as file:
        return yaml.safe_load(file)


def _insert_records(collection, collection_name, payload, empty_message):
    """Insert a list of documents (or a single document) and print the outcome.

    Raises:
        TypeError: If ``payload`` is neither a list nor a dict (for example an
            empty YAML file, which loads as ``None``).
    """
    if isinstance(payload, list):
        if not payload:
            print(empty_message)
            return
        result = collection.insert_many(payload)
        print(f"成功导入 {len(result.inserted_ids)} 条记录到集合 {collection_name}")
    elif isinstance(payload, dict):
        collection.insert_one(payload)
        print(f"成功导入1条记录到集合 {collection_name}")
    else:
        raise TypeError(
            f"文件内容必须是单个对象或对象数组,实际类型为 {type(payload).__name__}"
        )


def import_file_to_mongodb(file_path, db_name, collection_name, host='localhost', port=27017):
    """Import a local file into a MongoDB collection.

    The file type is chosen from the extension (case-insensitive): ``.csv``,
    ``.tsv``, ``.json`` (a JSON document or JSON Lines), ``.xlsx``/``.xls``,
    ``.xml`` and ``.yaml``/``.yml``. Progress and errors are printed; nothing
    is raised to the caller and nothing is returned.

    Args:
        file_path: Path of the file to import.
        db_name: Name of the target database.
        collection_name: Name of the target collection.
        host: MongoDB host address.
        port: MongoDB port.
    """
    # Connect to MongoDB.
    client = None
    try:
        client = MongoClient(host, port)
        # MongoClient connects lazily, so issue a cheap command to make sure
        # a server is actually reachable before reporting success.
        client.admin.command('ping')
        collection = client[db_name][collection_name]
        print(f"成功连接到MongoDB: {host}:{port}")
    except Exception as e:
        print(f"连接MongoDB失败: {e}")
        if client is not None:
            client.close()
        return

    # Pick the loader from the file extension.
    file_ext = os.path.splitext(file_path)[1].lower()
    try:
        if file_ext == '.csv':
            payload = _read_delimited(file_path)
            empty_message = "CSV文件为空,未导入任何数据"
        elif file_ext == '.json':
            payload, is_json_lines = _read_json(file_path)
            if is_json_lines:
                empty_message = "JSONL文件为空,未导入任何数据"
            else:
                empty_message = "JSON文件为空数组,未导入任何数据"
        elif file_ext in ('.xlsx', '.xls'):
            payload = _read_excel(file_path)
            empty_message = "Excel文件为空,未导入任何数据"
        elif file_ext == '.tsv':
            payload = _read_delimited(file_path, delimiter='\t')
            empty_message = "TSV文件为空,未导入任何数据"
        elif file_ext == '.xml':
            payload = _read_xml(file_path)
            empty_message = "XML文件为空,未导入任何数据"
        elif file_ext in ('.yaml', '.yml'):
            payload = _read_yaml(file_path)
            empty_message = "YAML文件为空数组,未导入任何数据"
        else:
            print(f"不支持的文件格式: {file_ext}")
            return
        _insert_records(collection, collection_name, payload, empty_message)
    except Exception as e:
        # Top-level boundary of a command-line tool: report and carry on to
        # the cleanup below instead of crashing with a traceback.
        print(f"导入文件失败: {e}")
    finally:
        # Always release the connection, including on the early return above.
        client.close()
        print("已关闭MongoDB连接")
if __name__ == "__main__":
    # Command-line entry point:
    #   python <script> <file_path> <db_name> <collection_name> [host] [port]
    # The three required values come from argv; they were previously
    # referenced as undefined names, which raised NameError on every run.
    if not 4 <= len(sys.argv) <= 6:
        print(f"用法: python {sys.argv[0]} <file_path> <db_name> <collection_name> [host] [port]")
        sys.exit(1)

    file_path, db_name, collection_name = sys.argv[1:4]
    host = sys.argv[4] if len(sys.argv) > 4 else 'localhost'
    try:
        port = int(sys.argv[5]) if len(sys.argv) > 5 else 27017
    except ValueError:
        print(f"端口必须是整数: {sys.argv[5]}")
        sys.exit(1)

    import_file_to_mongodb(file_path, db_name, collection_name, host, port)