week-1
January 29, 2024
# Week 1
## 1. Extract data from different file formats and display the summary statistics.
[ ]: import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
## Extraction from CSV file
[ ]: files = pd.read_csv("/content/sample_data/california_housing_train.csv")
[ ]: files.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17000 entries, 0 to 16999
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 longitude 17000 non-null float64
1 latitude 17000 non-null float64
2 housing_median_age 17000 non-null float64
3 total_rooms 17000 non-null float64
4 total_bedrooms 17000 non-null float64
5 population 17000 non-null float64
6 households 17000 non-null float64
7 median_income 17000 non-null float64
8 median_house_value 17000 non-null float64
dtypes: float64(9)
memory usage: 1.2 MB
[ ]: files.describe()
[ ]: longitude latitude housing_median_age total_rooms \
count 17000.000000 17000.000000 17000.000000 17000.000000
mean -119.562108 35.625225 28.589353 2643.664412
std 2.005166 2.137340 12.586937 2179.947071
min -124.350000 32.540000 1.000000 2.000000
25% -121.790000 33.930000 18.000000 1462.000000
50% -118.490000 34.250000 29.000000 2127.000000
75% -118.000000 37.720000 37.000000 3151.250000
max -114.310000 41.950000 52.000000 37937.000000
total_bedrooms population households median_income \
count 17000.000000 17000.000000 17000.000000 17000.000000
mean 539.410824 1429.573941 501.221941 3.883578
std 421.499452 1147.852959 384.520841 1.908157
min 1.000000 3.000000 1.000000 0.499900
25% 297.000000 790.000000 282.000000 2.566375
50% 434.000000 1167.000000 409.000000 3.544600
75% 648.250000 1721.000000 605.250000 4.767000
max 6445.000000 35682.000000 6082.000000 15.000100
median_house_value
count 17000.000000
mean 207300.912353
std 115983.764387
min 14999.000000
25% 119400.000000
50% 180400.000000
75% 265000.000000
max 500001.000000
[ ]: sns.boxplot(files["median_house_value"])
[ ]: <Axes: >
2
## Extraction from JSON file
[ ]: import json
[ ]: sample = json.load(open("/content/sample_data/anscombe.json"))
[ ]: sample = pd.DataFrame(sample)
[ ]: sample.describe()
[ ]: X Y
count 44.000000 44.000000
mean 9.000000 7.500455
std 3.198837 1.959244
min 4.000000 3.100000
25% 7.000000 6.117500
50% 8.000000 7.520000
75% 11.000000 8.747500
max 19.000000 12.740000
[ ]: sns.boxplot(sample)
3
[ ]: <Axes: >
## Extraction from Markdown file
[ ]: import markdown as md
from bs4 import BeautifulSoup as bs
html = md.markdown(open("/content/sample_data/README.md").read())
print("".join(bs(html).findAll(text=True)))
This directory includes a few sample datasets to get you started.
california_housing_data*.csv is California housing data from the 1990 US
Census; more information is available at:
https://developers.google.com/machine-learning/crash-course/california-housing-data-description
mnist_*.csv is a small sample of the
MNIST database, which is
described at: http://yann.lecun.com/exdb/mnist/
anscombe.json contains a copy of
Anscombe's quartet; it
was originally described in
Anscombe, F. J. (1973). 'Graphs in Statistical Analysis'. American
Statistician. 27 (1): 17-21. JSTOR 2682899.
and our copy was prepared by the
vega_datasets library.
<ipython-input-53-6b35d5dcf2d9>:4: DeprecationWarning: The 'text' argument to
find()-type methods is deprecated. Use 'string' instead.
print("".join(bs(html).findAll(text=True)))
## Extraction from Excel file
[ ]: ext = pd.read_excel("/content/drive/MyDrive/SAMPLES.xlsx")
[ ]: ext.describe()
[ ]: Amount Profit Quantity
count 19.000000 19.000000 19.000000
mean 1653.263158 146.684211 5.473684
std 1556.337825 527.299625 2.988281
min 6.000000 -891.000000 1.000000
25% 253.000000 -201.500000 4.000000
50% 1854.000000 1.000000 5.000000
75% 2430.500000 545.500000 6.500000
max 5729.000000 1151.000000 14.000000
[ ]: sns.boxplot(ext["Profit"])
[ ]: <Axes: >
5
[ ]: