R语言学习笔记-使用R语言进行数据分析-线性回归算法-以墨尔本房价为例(该文章参考的其他文章)

本文链接：https://blog.csdn.net/CSDNXXCQ/article/details/132201241

先进行简单的测试(后面将代码用自己的环境重新运行一遍)

#数据集样式

# Values of height
151, 174, 138, 186, 128, 136, 179, 163, 152, 131

# Values of weight.
63, 81, 56, 91, 47, 57, 76, 72, 62, 48

#线性回归函数
lm(formula,data)

# Sample data: x holds the ten heights, y the ten matching weights.
x <- c(
  151, 174, 138, 186, 128,
  136, 179, 163, 152, 131
)
y <- c(
  63, 81, 56, 91, 47,
  57, 76, 72, 62, 48
)

# Fit a simple linear regression of y on x.
relation <- lm(formula = y ~ x)

# Show the call and the estimated intercept and slope.
print(relation)

在这里插入图片描述

# Get the summary information of the fitted model.
# Same sample data as before: heights in x, weights in y.
x <- c(151, 174, 138, 186, 128,
       136, 179, 163, 152, 131)
y <- c(63, 81, 56, 91, 47,
       57, 76, 72, 62, 48)

# Regress y on x with lm().
relation <- lm(formula = y ~ x)

# Print the full summary: residuals, coefficient table, R-squared, F-statistic.
print(summary(relation))

输出
在这里插入图片描述
进行数值预测

predict(object, newdata)

预测一个身高为170的人的体重

# Predictor: the ten heights.
x <- c(151, 174, 138, 186, 128,
       136, 179, 163, 152, 131)
# Response: the ten matching weights.
y <- c(63, 81, 56, 91, 47,
       57, 76, 72, 62, 48)

# Fit the simple linear regression of y on x.
relation <- lm(formula = y ~ x)

# Predict the weight of a person whose height is 170.
# The new data frame must use the same column name (x) as the predictor
# in the model formula so that predict() can match it.
a <- data.frame(x = 170)
result <- predict(relation, newdata = a)
print(result)

以图形方式展示(这一段代码有错没法运行)

# Create the predictor (x: heights) and response (y: weights) variables
# and fit the simple linear regression of y on x.
x <- c(151, 174, 138, 186, 128, 136, 179, 163, 152, 131)
y <- c(63, 81, 56, 91, 47, 57, 76, 72, 62, 48)
relation <- lm(y ~ x)

# Open a PNG graphics device; everything drawn until dev.off() is written
# to this file in the current working directory.
png(file = "linearregression.png")

# Draw the scatter plot first. x holds the heights and y the weights, so the
# x axis is labelled as height and the y axis as weight.
plot(x, y, col = "blue", main = "Height & Weight Regression",
     cex = 1.3, pch = 16, xlab = "Height in cm", ylab = "Weight in Kg")

# Overlay the fitted regression line. abline() adds to an existing plot, so
# it must be its own call after plot() rather than an argument inside plot().
abline(relation)

# Close the device so the file is flushed to disk.
dev.off()

多变量之间的关联

考虑在R语言环境中可用的数据集“mtcars”。它给出了每加仑里程（mpg），气缸排量（“disp”），马力（“hp”），汽车重量（“wt”）和一些其他参数的不同汽车模型之间的比较。

模型的目标是建立“mpg”作为响应变量与“disp”，“hp”和“wt”作为预测变量之间的关系。

# Take a look at the data set: keep only the response (mpg) and the three
# predictors (disp, hp, wt) from the built-in mtcars data.
input <- mtcars[c("mpg", "disp", "hp", "wt")]

# Print the first rows.
print(head(input))

输出结果
在这里插入图片描述
创建关系模型并获取系数

# Subset mtcars to the response (mpg) and the predictors (disp, hp, wt).
input <- mtcars[c("mpg", "disp", "hp", "wt")]

# Fit the multiple regression of mpg on disp, hp and wt.
model <- lm(formula = mpg ~ disp + hp + wt, data = input)

# Print the fitted model (call and coefficients).
print(model)

# Header line before the individual coefficient values.
cat("# # # # The Coefficient Values # # # ", "\n")

# Use coef() to read the coefficients of the fitted model, one per variable.
# Each value is a named length-one numeric vector.
a <- coef(model)["(Intercept)"]
Xdisp <- coef(model)["disp"]
Xhp <- coef(model)["hp"]
Xwt <- coef(model)["wt"]

print(a)
print(Xdisp)
print(Xhp)
print(Xwt)

输出结果
在这里插入图片描述

# Building on the previous snippet (which defines `model`), print the model
# summary: residuals, coefficient table, R-squared and F-statistic.
print(summary(model))

输出结果
在这里插入图片描述

在这个标题之前的均为教程的内容，以下为本次小demo

数据的基本情况

通过直接打开进行预览
在这里插入图片描述
这里的文件路径要替换

# Read the data set (replace the path with the location of your own copy).
# The result is not assigned, so the data frame is printed as a preview.
# header = TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned, whereas `TRUE` is a reserved word.
read.csv("C:/Users/现在，是肖晓晨/Desktop/学校项目A/悉尼大学数据科学学习资料/5003R语言学习资料/作业1/Melbourne_housing_FULL.csv", header = TRUE, na.strings = "NA")

输出结果
在这里插入图片描述

ChatGPT给出的答案

导入所需的库和数据集

# 导入所需库
library(dplyr)   # 数据处理
library(ggplot2) # 数据可视化
library(caret)   # 模型评估

# Read the CSV file (replace the path with the location of your own copy).
# The "/" between the "作业1" folder and the file name is required; without it
# the path differs from the other reads of this same file in this document.
data <- read.csv("C:/Users/现在，是肖晓晨/Desktop/学校项目A/悉尼大学数据科学学习资料/5003R语言学习资料/作业1/Melbourne_housing_FULL.csv")

数据预处理和探索

# Look at the first few rows of the data.
head(data)

# Per-column summary of the data.
summary(data)

# Handle missing values (assumes they are represented as NA):
# drop every row that contains at least one NA.
data_clean <- na.omit(data)

# Explore pairwise relationships: correlation matrix of the price and the
# candidate predictors.
cor(
  data_clean[, c(
    "Price", "Rooms", "Distance", "Bedroom2", "Bathroom",
    "Car", "Landsize", "BuildingArea", "YearBuilt"
  )]
)

拟合线性回归模型

# Fit a linear regression of Price on the eight predictors,
# using the rows of data_clean (the NA-free data).
lm_model <- lm(
  formula = Price ~ Rooms + Distance + Bedroom2 + Bathroom + Car +
    Landsize + BuildingArea + YearBuilt,
  data = data_clean
)

# Show the model summary.
summary(lm_model)

模型评估

# Evaluate the model with cross-validation (caret).
# cv_results holds the resampling settings: 10-fold cross-validation.
cv_results <- trainControl(method = "cv", number = 10)

# Train the same linear model under that resampling scheme.
cv_lm <- train(
  Price ~ Rooms + Distance + Bedroom2 + Bathroom + Car +
    Landsize + BuildingArea + YearBuilt,
  data = data_clean,
  method = "lm",
  trControl = cv_results
)

# Show the cross-validation results.
print(cv_lm)

可视化分析

# Draw two of the standard lm diagnostic plots, in order:
#   which = 1 -> residuals vs fitted values
#   which = 2 -> normal Q-Q plot of the residuals
for (panel in c(1, 2)) {
  plot(lm_model, which = panel)
}

不依赖特定的包,该版本可运行

# Read the CSV file (replace the path with the location of your own copy).
data <- read.csv(
  "C:/Users/现在，是肖晓晨/Desktop/学校项目A/悉尼大学数据科学学习资料/5003R语言学习资料/作业1/Melbourne_housing_FULL.csv"
)

# Handle missing values: drop every row that contains at least one NA.
data_clean <- na.omit(data)

# Fit the linear regression of Price on the eight predictors.
lm_model <- lm(
  formula = Price ~ Rooms + Distance + Bedroom2 + Bathroom + Car +
    Landsize + BuildingArea + YearBuilt,
  data = data_clean
)

# Show the model summary.
summary(lm_model)

# Manual k-fold cross-validation of the price model (base R only).
# Expects `data_clean` (the NA-free data frame) to exist already.
set.seed(123)  # makes the random fold assignment below reproducible
num_folds <- 10
n_obs <- nrow(data_clean)

# Every fold needs at least one validation row.
stopifnot(n_obs >= num_folds)

# Randomly assign each row to exactly one fold. rep(..., length.out = n_obs)
# gives fold sizes that differ by at most one, so no row is left out of
# validation when n_obs is not a multiple of num_folds; sample() shuffles the
# labels so the folds do not simply follow the row order of the file.
fold_id <- sample(rep(seq_len(num_folds), length.out = n_obs))

# One RMSE per fold, preallocated.
rmse_values <- numeric(num_folds)

for (i in seq_len(num_folds)) {
  in_validation <- fold_id == i

  # Hold out fold i; train on all remaining rows.
  validation_data <- data_clean[in_validation, ]
  train_data <- data_clean[!in_validation, ]

  lm_fold <- lm(
    Price ~ Rooms + Distance + Bedroom2 + Bathroom + Car +
      Landsize + BuildingArea + YearBuilt,
    data = train_data
  )

  predictions <- predict(lm_fold, newdata = validation_data)

  # Root-mean-squared prediction error on the held-out fold.
  rmse_values[i] <- sqrt(mean((predictions - validation_data$Price)^2))
}

# Show the cross-validation results (RMSE per fold).
print(rmse_values)

# Diagnostic plots for the fitted model lm_model.

# Panel 1: residuals against fitted values.
plot(x = lm_model, which = 1)

# Panel 2: normal Q-Q plot of the residuals.
plot(x = lm_model, which = 2)