先进行简单的测试(后面将代码用自己的环境重新运行一遍)
#数据集样式
# Values of height
151, 174, 138, 186, 128, 136, 179, 163, 152, 131
# Values of weight.
63, 81, 56, 91, 47, 57, 76, 72, 62, 48
#线性回归函数
lm(formula,data)
# Observations: y holds the weights (response), x the heights (predictor).
y <- c(63, 81, 56, 91, 47, 57, 76, 72, 62, 48)
x <- c(151, 174, 138, 186, 128, 136, 179, 163, 152, 131)
# Fit the simple linear regression of weight on height and show the
# fitted call together with its coefficients.
relation <- lm(formula = y ~ x)
print(relation)
# Get the summary information of the fitted model.
x <- c(
  151, 174, 138, 186, 128,
  136, 179, 163, 152, 131
)
y <- c(
  63, 81, 56, 91, 47,
  57, 76, 72, 62, 48
)
# Fit weight (y) against height (x), then print the full summary
# (residuals, coefficient table, R-squared, F-statistic).
relation <- lm(formula = y ~ x)
relation_summary <- summary(relation)
print(relation_summary)
输出
进行数值预测
predict(object, newdata)
预测一个身高为170cm的人的体重
# The response vector (weights).
y <- c(63, 81, 56, 91, 47, 57, 76, 72, 62, 48)
# The predictor vector (heights).
x <- c(151, 174, 138, 186, 128, 136, 179, 163, 152, 131)
# Fit the linear model of weight on height.
relation <- lm(formula = y ~ x)
# Find the weight of a person with height 170. The new data must be a data
# frame whose column name matches the predictor name used in the formula (x).
a <- data.frame(x = 170)
result <- predict(relation, newdata = a)
print(result)
以图形方式展示(教程原代码有错没法运行:abline(relation) 被写进了 plot() 的参数里,应在 plot() 之后单独调用;另外 x 是身高、y 是体重,坐标轴标签也写反了)
# Create the predictor (x: height) and response (y: weight) variables.
x <- c(151, 174, 138, 186, 128, 136, 179, 163, 152, 131)
y <- c(63, 81, 56, 91, 47, 57, 76, 72, 62, 48)
relation <- lm(y ~ x)
# Open a PNG graphics device; everything drawn until dev.off() is written
# to this file.
png(filename = "linearregression.png")
# Draw the scatter plot first. x is height and y is weight, so the axis
# labels follow that order.
plot(x, y,
  col = "blue", main = "Height & Weight Regression",
  cex = 1.3, pch = 16, xlab = "Height in cm", ylab = "Weight in Kg"
)
# abline() adds a line to an existing plot, so it must be its own call after
# plot() rather than an argument of plot().
abline(relation)
# Close the device so the file is flushed to disk.
dev.off()
多变量之间的关联
考虑在R语言环境中可用的数据集“mtcars”。 它给出了每加仑里程(mpg),气缸排量(“disp”),马力(“hp”),汽车重量(“wt”)和一些其他参数的不同汽车模型之间的比较。
模型的目标是建立“mpg”作为响应变量与“disp”,“hp”和“wt”作为预测变量之间的关系。
# Take a look at the data set: keep the response (mpg) and the three
# predictors (disp, hp, wt), then print the first rows.
model_columns <- c("mpg", "disp", "hp", "wt")
input <- mtcars[, model_columns]
print(head(input))
输出结果
创建关系模型并获取系数
# Keep only the response and the three predictors.
input <- mtcars[, c("mpg", "disp", "hp", "wt")]
# Create the relationship model: mpg explained by disp, hp and wt.
model <- lm(formula = mpg ~ disp + hp + wt, data = input)
# Show the model.
print(model)
# Header line for the coefficient listing.
cat("# # # # The Coefficient Values # # # ", "\n")
# Use coef() to read the coefficients of the fitted model; it returns a named
# vector ordered as intercept, disp, hp, wt.
estimates <- coef(model)
a <- estimates[1]
Xdisp <- estimates[2]
Xhp <- estimates[3]
Xwt <- estimates[4]
for (estimate in list(a, Xdisp, Xhp, Xwt)) {
  print(estimate)
}
输出结果
# Building on the code above, add the model summary.
model_summary <- summary(model)
print(model_summary)
输出结果
在这个标题之前的均为教程的内容,以下为本次小demo
数据的基本情况
通过直接打开进行预览
这里的文件路径要替换
# Read the data set. The result is not assigned, so at top level the whole
# data frame is printed as a preview. Replace the path with your own location.
# TRUE is spelled out because the T alias is an ordinary variable that can be
# reassigned.
read.csv("C:/Users/现在,是肖晓晨/Desktop/学校项目A/悉尼大学数据科学学习资料/5003R语言学习资料/作业1/Melbourne_housing_FULL.csv", header = TRUE, na.strings = "NA")
输出结果
chatgpt给出的答案
导入所需的库和数据集
# 导入所需库
library(dplyr) # 数据处理
library(ggplot2) # 数据可视化
library(caret) # 模型评估
# Read the CSV file. The directory and the file name need a "/" between them;
# the other read.csv() calls in this file use ".../作业1/Melbourne_housing_FULL.csv".
data <- read.csv("C:/Users/现在,是肖晓晨/Desktop/学校项目A/悉尼大学数据科学学习资料/5003R语言学习资料/作业1/Melbourne_housing_FULL.csv")
数据预处理和探索
# Look at the first rows of the data.
head(data)
# Per-column summary of the data.
summary(data)
# Handle missing values (assuming they are represented as NA): drop every row
# that contains at least one NA.
data_clean <- na.omit(data)
# Explore the relationships between variables: pairwise correlations of the
# price and the candidate predictors.
corr_columns <- c(
  "Price", "Rooms", "Distance", "Bedroom2", "Bathroom",
  "Car", "Landsize", "BuildingArea", "YearBuilt"
)
cor(data_clean[, corr_columns])
拟合线性回归模型
# Fit the linear regression model: Price explained by the eight listed
# predictors, using the NA-free data.
lm_model <- lm(
  formula = Price ~ Rooms + Distance + Bedroom2 + Bathroom + Car +
    Landsize + BuildingArea + YearBuilt,
  data = data_clean
)
# Show the model summary.
summary(lm_model)
模型评估
# Evaluate the model with 10-fold cross-validation (caret).
# The folds are drawn at random, so fix the seed first to make the reported
# metrics reproducible between runs.
set.seed(123)
# Despite its name, cv_results holds the resampling configuration, not results.
cv_results <- trainControl(method = "cv", number = 10)
cv_lm <- train(
  Price ~ Rooms + Distance + Bedroom2 + Bathroom + Car +
    Landsize + BuildingArea + YearBuilt,
  data = data_clean,
  method = "lm",
  trControl = cv_results
)
# Show the cross-validation results.
print(cv_lm)
可视化分析
# Draw the two diagnostic plots of the fitted model one after the other:
# which = 1 is the residual plot, which = 2 is the Q-Q plot.
for (diagnostic in c(1, 2)) {
  plot(lm_model, which = diagnostic)
}
不依赖特定的包,该版本可运行
# Read the CSV file (replace the path with your own location).
data <- read.csv("C:/Users/现在,是肖晓晨/Desktop/学校项目A/悉尼大学数据科学学习资料/5003R语言学习资料/作业1/Melbourne_housing_FULL.csv")
# Handle missing values: drop every row that contains at least one NA.
data_clean <- na.omit(data)
# Fit the linear regression model of Price on the eight listed predictors.
lm_model <- lm(
  formula = Price ~ Rooms + Distance + Bedroom2 + Bathroom + Car +
    Landsize + BuildingArea + YearBuilt,
  data = data_clean
)
# Show the model summary.
summary(lm_model)
# Manual k-fold cross-validation of the linear model, base R only.
set.seed(123) # makes the random fold assignment below reproducible
num_folds <- 10
n_obs <- nrow(data_clean)
# With fewer rows than folds, some folds would have no validation rows.
stopifnot(n_obs >= num_folds)
# Assign every row to exactly one fold at random. rep(..., length.out = n_obs)
# spreads the rows as evenly as possible, so no trailing rows are left out of
# validation, and sample() shuffles the labels so that folds do not simply
# follow the order of the rows in the file.
fold_id <- sample(rep(seq_len(num_folds), length.out = n_obs))
rmse_values <- numeric(num_folds)
for (i in seq_len(num_folds)) {
  in_validation <- fold_id == i
  validation_data <- data_clean[in_validation, ]
  train_data <- data_clean[!in_validation, ]
  # Refit on the training folds only, then score the held-out fold.
  lm_fold <- lm(
    Price ~ Rooms + Distance + Bedroom2 + Bathroom + Car +
      Landsize + BuildingArea + YearBuilt,
    data = train_data
  )
  predictions <- predict(lm_fold, newdata = validation_data)
  rmse_values[i] <- sqrt(mean((predictions - validation_data$Price)^2))
}
# Show the cross-validation results (one RMSE per fold).
print(rmse_values)
# Draw the two diagnostic plots of the fitted model one after the other:
# which = 1 is the residual plot, which = 2 is the Q-Q plot.
for (diagnostic in c(1, 2)) {
  plot(lm_model, which = diagnostic)
}