Numpy NP Pandas PD Matplotlib - Pyplot PLT Seaborn SNS: "Merged - Uscol - TXT" ","
Numpy NP Pandas PD Matplotlib - Pyplot PLT Seaborn SNS: "Merged - Uscol - TXT" ","
In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
Out[2]:
University of
1 1004 AL 1 NaN
Montevallo
Auburn
2 1009 University-Main AL 1 575.0
Campus
Birmingham-
3 1012 Southern AL 2 575.0
College
University of
4 1016 AL 1 NaN
North Alabama
5 rows × 51 columns
In [3]:
localhost:8888/lab 1/18
7/30/2020 Exploratory Data Analysis
In [4]:
Out[4]:
University of
1 1004 AL 1 NaN
Montevallo
Auburn
2 1009 University-Main AL 1 575.0
Campus
Birmingham-
3 1012 Southern AL 2 575.0
College
University of
4 1016 AL 1 NaN
North Alabama
5 rows × 51 columns
In [5]:
# Let's replace our NaN values with the mean of the corresponding column:
df.fillna(df.mean(), inplace=True, axis=0)
In [6]:
df.head()
Out[6]:
University of
1 1004 AL 1 512.605144
Montevallo
Auburn
2 1009 University-Main AL 1 575.000000
Campus
Birmingham-
3 1012 Southern AL 2 575.000000
College
University of
4 1016 AL 1 512.605144
North Alabama
5 rows × 51 columns
localhost:8888/lab 2/18
7/30/2020 Exploratory Data Analysis
In [7]:
localhost:8888/lab 3/18
7/30/2020 Exploratory Data Analysis
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1133 entries, 0 to 1132
Data columns (total 51 columns):
# Column Non-Null Count Dt
ype
--- ------ -------------- --
---
0 FICE 1133 non-null in
t64
1 College_name.x 1133 non-null ob
ject
2 States 1133 non-null ob
ject
3 Public_indicator 1133 non-null in
t64
4 Average_Math_SAT_score 1133 non-null fl
oat64
5 Average_Verbal_SAT_score 1133 non-null fl
oat64
6 Average_Combined_SAT_score 1133 non-null fl
oat64
7 Average_ACT_score 1133 non-null fl
oat64
8 First_quartile_Math_SAT 1133 non-null fl
oat64
9 Third_quartile_Math_SAT 1133 non-null fl
oat64
10 First_quartile_Verbal_SAT 1133 non-null fl
oat64
11 Third_quartile_Verbal_SAT 1133 non-null fl
oat64
12 First_quartile_ACT 1133 non-null fl
oat64
13 Third_quartile_ACT 1133 non-null fl
oat64
14 Number_applications_received 1133 non-null fl
oat64
15 Number_applicants_accepted 1133 non-null fl
oat64
16 Number_new_students_enrolled 1133 non-null fl
oat64
17 new_students_from_top_ten_percent_HS_class 1133 non-null fl
oat64
18 students_from_top_twenty_five_percent_of_HS_class 1133 non-null fl
oat64
19 Number_fulltime_undergraduates 1133 non-null fl
oat64
20 Number_parttime_undergraduates 1133 non-null fl
oat64
21 In_state_tuition 1133 non-null fl
oat64
22 Out_state_tuition 1133 non-null fl
oat64
23 Room_and_board_costs 1133 non-null fl
oat64
24 Room_costs 1133 non-null fl
oat64
25 Board_costs 1133 non-null fl
oat64
26 Additional_fees 1133 non-null fl
oat64
localhost:8888/lab 4/18
7/30/2020 Exploratory Data Analysis
localhost:8888/lab 5/18
7/30/2020 Exploratory Data Analysis
In [8]:
Out[8]:
8 rows × 46 columns
localhost:8888/lab 6/18
7/30/2020 Exploratory Data Analysis
In [9]:
Out[9]:
FICE 1132
College_name.x 1110
States 51
Public_indicator 2
Average_Math_SAT_score 227
Average_Verbal_SAT_score 206
Average_Combined_SAT_score 315
Average_ACT_score 17
First_quartile_Math_SAT 83
Third_quartile_Math_SAT 80
First_quartile_Verbal_SAT 65
Third_quartile_Verbal_SAT 82
First_quartile_ACT 21
Third_quartile_ACT 20
Number_applications_received 1007
Number_applicants_accepted 974
Number_new_students_enrolled 804
new_students_from_top_ten_percent_HS_class 89
students_from_top_twenty_five_percent_of_HS_class 91
Number_fulltime_undergraduates 1018
Number_parttime_undergraduates 809
In_state_tuition 850
Out_state_tuition 868
Room_and_board_costs 735
Room_costs 548
Board_costs 434
Additional_fees 410
Estimated_book_costs 157
Estimated_personal_spending 376
Pct_of_faculty_with_PhD 88
Pct_of_faculty_with_terminal_degree 75
Student_and_faculty_ratio 197
Pct_alumni_who_donate 62
Instructional_expenditure_per_student 1051
Graduation_rate 89
College_name.y 1112
State 52
Type 4
Average_salary_full_professors 428
Average_salary_associate_professors 303
Average_salary_assistant_professors 235
Average_salary_all_ranks 343
Average_compensation_full_professors 486
Average_compensation_associate_professors 373
Average_compensation_assistant_professors 305
Average_compensation_all_ranks 431
Number_of_full_professors 298
Number_of_associate_professors 255
Number_of_assistant_professors 241
Number_of_instructors 83
Number_of_faculty_all_ranks 493
dtype: int64
localhost:8888/lab 7/18
7/30/2020 Exploratory Data Analysis
In [10]:
# Return the counts of all the categorical values in the "Type" column:
df["Type"].value_counts()
Out[10]:
IIB 598
IIA 356
I 178
VIIB 1
Name: Type, dtype: int64
In [11]:
# Drop all categorical columns except "Type" as we convert this to a numerical column!
df.drop(["College_name.x", "States", "College_name.y", "State"], axis=1,inplace=True)
In [12]:
localhost:8888/lab 8/18
7/30/2020 Exploratory Data Analysis
In [13]:
# Check that our "Type" column has been replaced with numerical columns for "Type":
df.columns.tolist()
Out[13]:
['FICE',
'Public_indicator',
'Average_Math_SAT_score',
'Average_Verbal_SAT_score',
'Average_Combined_SAT_score',
'Average_ACT_score',
'First_quartile_Math_SAT',
'Third_quartile_Math_SAT',
'First_quartile_Verbal_SAT',
'Third_quartile_Verbal_SAT',
'First_quartile_ACT',
'Third_quartile_ACT',
'Number_applications_received',
'Number_applicants_accepted',
'Number_new_students_enrolled',
'new_students_from_top_ten_percent_HS_class',
'students_from_top_twenty_five_percent_of_HS_class',
'Number_fulltime_undergraduates',
'Number_parttime_undergraduates',
'In_state_tuition',
'Out_state_tuition',
'Room_and_board_costs',
'Room_costs',
'Board_costs',
'Additional_fees',
'Estimated_book_costs',
'Estimated_personal_spending',
'Pct_of_faculty_with_PhD',
'Pct_of_faculty_with_terminal_degree',
'Student_and_faculty_ratio',
'Pct_alumni_who_donate',
'Instructional_expenditure_per_student',
'Graduation_rate',
'Average_salary_full_professors',
'Average_salary_associate_professors',
'Average_salary_assistant_professors',
'Average_salary_all_ranks',
'Average_compensation_full_professors',
'Average_compensation_associate_professors',
'Average_compensation_assistant_professors',
'Average_compensation_all_ranks',
'Number_of_full_professors',
'Number_of_associate_professors',
'Number_of_assistant_professors',
'Number_of_instructors',
'Number_of_faculty_all_ranks',
'Type_I',
'Type_IIA',
'Type_IIB',
'Type_VIIB']
localhost:8888/lab 9/18
7/30/2020 Exploratory Data Analysis
In [14]:
df.head()
Out[14]:
5 rows × 50 columns
In [15]:
localhost:8888/lab 10/18
7/30/2020 Exploratory Data Analysis
In [16]:
localhost:8888/lab 11/18
7/30/2020 Exploratory Data Analysis
In [17]:
In [18]:
localhost:8888/lab 12/18
7/30/2020 Exploratory Data Analysis
In [19]:
In [20]:
localhost:8888/lab 13/18
7/30/2020 Exploratory Data Analysis
In [21]:
# It is apparent that in our data we have some outliers, let's proceed to remove these
outliers:
plt.figure(figsize=(12,12))
sns.boxplot(x="Public_indicator", y="Graduation_rate", data=df)
plt.savefig("grad_rate vs public_indicator", quality=95, dpi=300, bbox_inches="tight")
localhost:8888/lab 14/18
7/30/2020 Exploratory Data Analysis
In [22]:
# We now remove all examples which contain values more than 3 standard deviations away
from our mean:
df = df[df.apply(lambda x: np.abs(x - x.mean()) / x.std() < 3).all(axis=1)]
df.info()
localhost:8888/lab 15/18
7/30/2020 Exploratory Data Analysis
<class 'pandas.core.frame.DataFrame'>
Int64Index: 832 entries, 1 to 1104
Data columns (total 50 columns):
# Column Non-Null Count Dt
ype
--- ------ -------------- --
---
0 FICE 832 non-null in
t64
1 Public_indicator 832 non-null in
t64
2 Average_Math_SAT_score 832 non-null fl
oat64
3 Average_Verbal_SAT_score 832 non-null fl
oat64
4 Average_Combined_SAT_score 832 non-null fl
oat64
5 Average_ACT_score 832 non-null fl
oat64
6 First_quartile_Math_SAT 832 non-null fl
oat64
7 Third_quartile_Math_SAT 832 non-null fl
oat64
8 First_quartile_Verbal_SAT 832 non-null fl
oat64
9 Third_quartile_Verbal_SAT 832 non-null fl
oat64
10 First_quartile_ACT 832 non-null fl
oat64
11 Third_quartile_ACT 832 non-null fl
oat64
12 Number_applications_received 832 non-null fl
oat64
13 Number_applicants_accepted 832 non-null fl
oat64
14 Number_new_students_enrolled 832 non-null fl
oat64
15 new_students_from_top_ten_percent_HS_class 832 non-null fl
oat64
16 students_from_top_twenty_five_percent_of_HS_class 832 non-null fl
oat64
17 Number_fulltime_undergraduates 832 non-null fl
oat64
18 Number_parttime_undergraduates 832 non-null fl
oat64
19 In_state_tuition 832 non-null fl
oat64
20 Out_state_tuition 832 non-null fl
oat64
21 Room_and_board_costs 832 non-null fl
oat64
22 Room_costs 832 non-null fl
oat64
23 Board_costs 832 non-null fl
oat64
24 Additional_fees 832 non-null fl
oat64
25 Estimated_book_costs 832 non-null fl
oat64
26 Estimated_personal_spending 832 non-null fl
oat64
localhost:8888/lab 16/18
7/30/2020 Exploratory Data Analysis
localhost:8888/lab 17/18
7/30/2020 Exploratory Data Analysis
In [23]:
df.describe()
Out[23]:
8 rows × 50 columns
localhost:8888/lab 18/18