Basics of Statistical Learning

Published: May 23, 2024

ISL Chapter 2

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
np.set_printoptions(precision=2)

Basic Numpy Commands

# Two 1-D integer arrays of equal length; `+` adds them element by element.
x = np.array([1, 2, 3, 4])
y = np.array([5, 6, 7, 2])

x + y

# A nested list becomes a two-dimensional array (a matrix).
z = np.array([[1, 2], [3, 4]])
print(z)

# Basic attributes: number of dimensions and element type.
z.ndim
z.dtype

# Passing `float` as the second argument forces a floating-point dtype.
np.array([[1, 2], [4, 3]], float).dtype

# Shape is (rows, columns).
z.shape

# Sum over every element of the matrix.
np.sum(z)

print(z)

# A flat integer array that is reshaped into a 3x3 matrix below.
d = np.array([1, 2, 3, 4, 56, 5, 7, 2, 43])
print(d)

d_reshape = d.reshape((3, 3))
print(d_reshape)

# Element in the second row, third column (indices start at 0).
print(d_reshape[1, 2])

# NOTE: `d` was built from integers, so the 4.5 assigned here is stored as 4.
d_reshape[0, 0] = 4.5
print(d_reshape)

# The integers 0..15 laid out as a 4x4 matrix.
A_matrix = np.array(np.arange(16)).reshape((4, 4))
A_matrix

# Element-wise square root, then element-wise square.
np.sqrt(d_reshape)

d_reshape ** 2

# One draw from N(0, 1), then a vector of ten N(0, 1) draws.
r = np.random.normal()
r1 = np.random.normal(0, 1, 10)
print(f"{r} \n {r1}")

admin:ssh_signing_key
Full control of public user SSH signing keys

np.random.normal?

# Draw 10000 samples from N(mu, sigma^2).
mu, sigma = 0, 0.1
s = np.random.normal(mu, sigma, 10000)

# Normalised 30-bin histogram of the samples; only the bin edges are reused.
count, bins, _patches = plt.hist(s, 30, density=True)

# Overlay the N(mu, sigma^2) density evaluated at the bin edges.
norm_const = 1 / (sigma * np.sqrt(2 * np.pi))
density = norm_const * np.exp(-(bins - mu) ** 2 / (2 * sigma ** 2))
plt.plot(bins, density, linewidth=2, color='r')
plt.show()

# Generate 50 independent draws from the standard normal distribution N(0, 1).
o = np.random.normal(size=50)
print(o)

# p is o plus independent N(50, 1) noise.
p = o + np.random.normal(loc=50, scale=1, size=50)
print(p)

# Correlation matrix of p and o.
np.corrcoef(p, o)

# No seed is set here, so two identical calls print different draws.
print(np.random.normal(scale=5, size=2))
print(np.random.normal(scale=5, size=2))

np.random.default_rng?

# Two generators created with the same seed produce the same draws.
rng = np.random.default_rng(1303)
print(rng.normal(scale=5, size=2))
rng2 = np.random.default_rng(1303)
print(rng2.normal(scale=5, size=2))

# Ten standard-normal draws from a separately seeded generator.
test = np.random.default_rng(3)
y = test.standard_normal(10)

# Sample mean, variance and standard deviation.
print(np.mean(y))  # equivalently y.mean()
print(y.var())

print(y.std())

# A 10x3 matrix of standard-normal draws.
X = test.standard_normal((10, 3))
X

# axis=0 averages down the rows, giving one mean per column.
print(X.mean(axis=0))

x_test = np.array([[1, 2, 3], [5, 3, 1]])
print(x_test)
print(x_test.mean())  # mean of all six entries
print(x_test.mean(axis=0))  # same as x_test.mean(0): one mean per column

Plotting(Graphics)

# subplots(1, 2) puts two axes side by side in a single figure.
fig, (ax1, ax2) = plt.subplots(1, 2)
ax1.plot(np.random.rand(10))
ax2.hist(np.random.randn(1000))

# 100 standard-normal (x, y) pairs drawn with plot() and circle markers.
fig, ax = plt.subplots(figsize=(8, 8))
rng = np.random.default_rng(1)
x = rng.standard_normal(100)
y = rng.standard_normal(100)
ax.plot(x, y, 'o');

# The same points with scatter(), plus axis labels and a title.
fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(x, y, marker='o')
ax.set_xlabel("the x-axis")
ax.set_ylabel("the y-axis")
ax.set_title("plot test");

# A 2x3 grid of axes; `axes` is indexed as axes[row, column].
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 5))

axes[0, 1].plot(x, y, 'o')  # first row, second column
axes[1, 2].scatter(x, y, marker="+")  # second row, third column
fig

# Write the grid figure to disk as PNG and as PDF.
fig.savefig("Figure.png", dpi=400)

fig.savefig("Figure.pdf", dpi=200);

#fig , ax = plt.subplots(figsize=(8,8))
# The same 50 points on [-pi, pi] are used for both axes.
x = np.linspace(-np.pi, np.pi, 50)
y = x
print(f"this is x: {x}")
print(f"this is y: {y}")

# f[i, j] = cos(y[i]) * 1 / (1 + x[j]**2), built as an outer product.
fig, ax = plt.subplots(figsize=(8, 8))
f = np.multiply.outer(np.cos(y), 1 / (1 + x ** 2))
ax.contour(x, y, f);

# Same surface with 40 contour levels.
fig, ax = plt.subplots(figsize=(8, 8))
ax.contour(x, y, f, levels=40);

# 40-level contour again, followed by a 3-D contour in a new figure.
fig, ax = plt.subplots(figsize=(8, 8))
ax.contour(x, y, f, levels=40);
fig = plt.figure()
ax3d = fig.add_subplot(111, projection='3d')
ax3d.contour3D(x, y, f, 50, cmap='binary');

# Show the matrix f as an image.
fig, ax = plt.subplots(figsize=(8, 8))
ax.imshow(f);

Indexing Data

# 11 evenly spaced floats from 0 to 10, both endpoints included.
seq1 = np.linspace(0, 10, 11)
seq1

#np.linspace?

# The integers 0..10; the stop value 11 is excluded.
seq2 = np.arange(0, 11)
seq2

#np.arange?

# start:stop slicing picks characters 3, 4 and 5 ("hin").
"machine learning"[3:6]  # same as "machine learning"[slice(3, 6)]

slice?

# 4x4 matrix holding the integers 0..15, filled row by row.
A_matrix = np.array(np.arange(16)).reshape((4, 4))
print(A_matrix)
print(A_matrix[1, 2])  # second row, third column

A_matrix[[1, 3]]  # a list of row indices selects the second and fourth rows

A_matrix[:, [0, 3]]  # all rows, first and fourth columns

# Two index lists are paired up element by element, so this returns the
# entries A[0, 1] and A[2, 3] rather than a 2x2 submatrix.
A_matrix[[0, 2], [1, 3]]

# Work-around: subset the rows first, then the columns of that result.
A_matrix[[0, 2]][:, [1, 3]]

# np.ix_ builds the intermediate index arrays so the subsetting is done in one step.
idx = np.ix_([0, 2], [1, 3])
print(idx)
A_matrix[idx]

# start:stop:step -- rows 1 and 3 (1:4:2) and columns 0 and 1 (the stop
# value 2 is excluded).
A_matrix[1:4:2, 0:2:1]

# Boolean mask with one entry per row of A_matrix, initially all False.
boolean_rows = np.zeros(A_matrix.shape[0], bool)
boolean_rows

#np.zeros?

# Flag the second and third rows.
boolean_rows[[1, 2]] = True
boolean_rows

# The mask compares equal to the integer array [0, 1, 1, 0] ...
np.all(boolean_rows == np.array([0, 1, 1, 0]))

A_matrix

# ... yet integer indexing picks rows 0, 1, 1, 0 (with repeats),
A_matrix[np.array([0, 1, 1, 0])]

# while the boolean mask keeps only the rows flagged True:
# booleans and integers are treated differently by NumPy.
A_matrix[boolean_rows]

# Boolean masks also work with np.ix_ to select a submatrix of A_matrix.
boolean_cols = np.zeros(A_matrix.shape[1], bool)
boolean_cols[[1, 3]] = True
idx_bool = np.ix_(boolean_rows, boolean_cols)
A_matrix[idx_bool]

# A boolean mask for the rows can be mixed with an integer list for the columns.
idx_mixed = np.ix_(boolean_rows, [1, 3])
A_matrix[idx_mixed]

Loading Data

# Comma-separated version of the data set.
auto = pd.read_csv('Auto.csv')
auto

# Whitespace-separated version. sep=r'\s+' replaces the deprecated
# delim_whitespace=True and matches the later read of the same file.
Auto = pd.read_csv('Auto.data', sep=r'\s+')
Auto

auto['horsepower']

# np.unique returns each distinct value of the column once.
np.unique(auto['horsepower'])
auto['horsepower'].sum()

#np.unique?

# Re-read the file, treating '?' entries as missing values (NaN).
auto = pd.read_csv('Auto.data', na_values=['?'], sep=r'\s+')
np.unique(auto['horsepower'])

auto['horsepower'].sum()

auto.shape  # (observations, variables); the original run noted 397 and 9

# Remove every row that contains a missing value.
Auto_new = auto.dropna()
Auto_new.shape

auto = Auto_new
auto.columns

auto[:4]  # first four rows

# Boolean Series that is True where 'year' is greater than 80.
idx_above_80 = auto['year'] > 80
print(idx_above_80)
auto[idx_above_80]

# A list of column names selects those columns.
auto[['horsepower', 'year']]

auto.index

# Use the 'name' column as the row index so rows can be looked up by name.
auto_mod = auto.set_index('name')
auto_mod

auto_mod.columns

# .loc selects rows by their index labels (here, the car names).
auto_mod.loc[['ford ranger', 'chevy s-10']]

# .iloc selects by integer position: the fourth and sixth rows.
auto_mod.iloc[[3, 5]]

auto.iloc[[3, 5]]

# All rows, second and third columns (by position).
auto_mod.iloc[:, [1, 2]]

auto.iloc[:, [1, 2]]  # same positions on the frame that still has the default index

# Fourth and fifth rows; first, third and fourth columns.
auto_mod.iloc[[3, 4], [0, 2, 3]]

auto_mod.loc['ford galaxie 500', ['year', 'mpg']]

# Boolean row selection combined with column labels.
idx_80 = auto_mod['year'] > 80
auto_mod.loc[idx_80, ['weight', 'origin']]

# The lambda receives the DataFrame and returns the boolean selector df['year'] > 80.
auto_mod.loc[lambda df: df['year'] > 80, ['weight', 'origin']]

auto_mod.loc[
    lambda df: (df['year'] > 80) & (df['mpg'] > 40),
    ['weight', 'origin'],
]

# Rows with displacement below 300 whose index name contains 'ford' or 'datsun'.
auto_mod.loc[
    lambda df: (df['displacement'] < 300)
    & (df.index.str.contains('ford') | df.index.str.contains('datsun')),
    ['weight', 'origin'],
]

# Same walkthrough again, reading the files from ../data-sets.
Auto = pd.read_csv('../data-sets/Auto.csv')
Auto

Auto.shape

# Per the original note, some horsepower entries are the placeholder "?".
np.unique(Auto['horsepower'])

# Read the whitespace-separated file with '?' converted to NaN.
Auto = pd.read_csv('../data-sets/Auto.data', na_values=['?'], sep=r'\s+')
Auto['horsepower'].sum()

# Drop rows with missing values.
Auto_new = Auto.dropna()
Auto_new.shape

# Boolean selector for rows whose 'year' is greater than 80.
idx_80 = Auto_new['year'] > 80
idx_80

Auto_new[idx_80]

Auto_new.iloc[[2, 3]]  # third and fourth rows by position

Auto_new.iloc[:, [0, 1, 4]]  # first, second and fifth columns by position

For Loops

# Running sum over a list.
total = 0
for value in [2, 3, 19]:
    total = total + value
print(total)

# Nested loops: add value * weight for every (value, weight) pair.
total = 0
for value in [1, 10, 20]:
    for weight in [50, 60, 70]:
        total = total + value * weight
print(total)

# zip walks the two lists in lockstep; the sum is a weighted average.
total = 0
for event, probability in zip([4, 6, 3], [0.2, 0.7, 0.1]):
    total = total + event * probability
print(total)

zip?

# zip() returns a lazy iterator, so printing it shows only the zip object.
print(zip([4, 6, 3], [0.2, 0.7, 0.1]))

names = ["Alice", "Bob", "Charlie", "Dave"]  # 4 elements
ages = [25, 30, 35]  # 3 elements
# zip stops at the shorter input, so "Dave" has no partner and is dropped.
zipped = zip(names, ages)
print(zipped)
print(list(zipped))  # list() consumes the iterator

# A 127x5 matrix of standard-normal draws from a seeded generator.
rng = np.random.default_rng(1)
A = rng.standard_normal((127, 5))
A

# Mask of the same shape: each entry is 0 (probability 0.8) or NaN (probability 0.2).
M = rng.choice([0, np.nan], p=[0.8, 0.2], size=A.shape)
M

# Adding the mask turns the NaN positions of M into NaN in A.
A += M
print(A)
A.shape

D = pd.DataFrame(A, columns=['gpa', 'speed', 'defense', 'agility', 'armor'])

D[:4]

# The mean of the NaN indicator is the share of missing values per column.
for col in D.columns:
    missing_share = np.isnan(D[col]).mean()
    print(f'Column "{col}" has {missing_share:.2%} missing values')

Graphical and Numerical Summaries

Auto = pd.read_csv("../data-sets/Auto.csv")

# Scatter of mpg against horsepower using matplotlib directly.
fig, ax = plt.subplots(figsize=(8, 8))
ax.plot(Auto['horsepower'], Auto['mpg'], 'o');

# The same plot through the DataFrame plotting interface, which returns the axes.
ax = Auto.plot.scatter('horsepower', 'mpg');
ax.set_title('Horsepower vs. MPG')

ax.figure

# The figure that owns the axes can be saved to disk.
figure = ax.figure
figure.savefig('../figures/horsepower_mpg.png')

# Two DataFrame scatter plots drawn onto existing axes via ax=.
fig, axes = plt.subplots(ncols=2, figsize=(15, 5))
Auto.plot.scatter('horsepower', 'mpg', ax=axes[1])
Auto.plot.scatter('horsepower', 'year', ax=axes[0])

Auto.horsepower.dtype
Auto.cylinders.dtype
Auto

# cylinders takes only a small number of distinct values, so it is
# converted to a categorical column with pd.Series(..., dtype='category').
Auto.cylinders = pd.Series(Auto.cylinders, dtype='category')
Auto.cylinders

# Box plot of mpg grouped by the categorical cylinders column.
fig, ax = plt.subplots(figsize=(8, 8))
Auto.boxplot('mpg', by='cylinders', ax=ax);

# Histograms of mpg with 9, 20 and 50 bins, side by side.
fig, axes = plt.subplots(ncols=3, figsize=(8, 8))
Auto.hist('mpg', color='red', bins=9, ax=axes[0]);
Auto.hist('mpg', color='blue', bins=20, ax=axes[1]);
Auto.hist('mpg', color='green', bins=50, ax=axes[2]);

Auto.mpg

# Scatter-plot matrix of the whole frame, then of three selected columns.
pd.plotting.scatter_matrix(Auto);

pd.plotting.scatter_matrix(Auto[['mpg', 'displacement', 'weight']]);

# Numerical summaries of selected columns.
Auto[['mpg', 'weight']].describe()

Auto['mpg'].describe()

Share on

Bluesky Facebook LinkedIn X (formerly Twitter)

WOOJIN PARK

ISL Chapter 2

Basic Numpy Commands

Plotting(Graphics)

Indexing Data

Loading Data

For Loops

Graphical and Numerical Summaries

Share on