Basics of Statistical Learning
Published:
ISL Chapter 2
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
np.set_printoptions(precision=2)
Basic Numpy Commands
# --- Creating arrays and element-wise arithmetic -------------------------
x = np.array([1, 2, 3, 4])
y = np.array([5, 6, 7, 2])
x + y  # element-wise sum of two equal-length 1-D arrays

# --- Inspecting a 2-D array ----------------------------------------------
z = np.array([[1, 2], [3, 4]])
print(z)
z.ndim  # number of dimensions
z.dtype  # element type chosen by NumPy for the integer literals
np.array([[1, 2], [4, 3]], float).dtype  # element type requested explicitly
z.shape  # (rows, columns)
np.sum(z)  # sum over every element
print(z)

# --- Reshaping a 1-D array into a 3x3 matrix ------------------------------
d = np.array([1, 2, 3, 4, 56, 5, 7, 2, 43])
print(d)
d_reshape = d.reshape((3, 3))
print(d_reshape)
print(d_reshape[1, 2])  # second row, third column
# d_reshape holds integers, so the assigned 4.5 is stored as 4.
d_reshape[0, 0] = 4.5
print(d_reshape)

# --- A 4x4 matrix holding 0..15, filled row by row -------------------------
A_matrix = np.arange(16).reshape((4, 4))
A_matrix

# --- Element-wise functions -------------------------------------------------
np.sqrt(d_reshape)
d_reshape ** 2

# --- Random draws from the normal distribution ------------------------------
r = np.random.normal()  # a single draw with the default parameters
r1 = np.random.normal(0, 1, 10)  # ten draws with mean 0 and std 1
print(f"{r} \n {r1}")
admin:ssh_signing_key
Full control of public user SSH signing keys
# Show the documentation for np.random.normal.  The IPython-only
# `np.random.normal?` form is not valid Python; help() works in both a
# notebook and a plain script.
help(np.random.normal)

# Draw 10,000 samples from N(mu, sigma^2) and compare their histogram with
# the theoretical density.
mu, sigma = 0, 0.1
s = np.random.normal(mu, sigma, 10000)

# density=True rescales the bar heights so the histogram is comparable to a
# probability density; the third return value is not needed here.
count, bins, ignored = plt.hist(s, 30, density=True)

# Normal density evaluated at the bin edges:
#   1 / (sigma * sqrt(2*pi)) * exp(-(x - mu)^2 / (2 * sigma^2))
plt.plot(
    bins,
    1 / (sigma * np.sqrt(2 * np.pi))
    * np.exp(-(bins - mu) ** 2 / (2 * sigma ** 2)),
    linewidth=2,
    color='r',
)
plt.show()
# Generate 50 independent draws from the standard normal distribution N(0, 1).
o = np.random.normal(size=50)
print(o)

# p is o plus independent N(50, 1) noise, so p and o are correlated.
p = o + np.random.normal(loc=50, scale=1, size=50)
print(p)
np.corrcoef(p, o)  # 2x2 correlation matrix of p and o

# The global generator is not re-seeded between these two calls, so they
# print different values.
print(np.random.normal(scale=5, size=2))
print(np.random.normal(scale=5, size=2))

# Show the documentation for np.random.default_rng.  The IPython-only
# `np.random.default_rng?` form is not valid Python; help() works in both.
help(np.random.default_rng)

# Two generators created with the same seed produce identical draws.
rng = np.random.default_rng(1303)
print(rng.normal(scale=5, size=2))
rng2 = np.random.default_rng(1303)
print(rng2.normal(scale=5, size=2))

# Summary statistics of a seeded sample.
test = np.random.default_rng(3)
y = test.standard_normal(10)
print(np.mean(y))  # or y.mean()
print(y.var())
print(y.std())

X = test.standard_normal((10, 3))
X
# axis=0 averages down the rows, giving one mean per column (3 values here).
print(X.mean(axis=0))

x_test = np.array([[1, 2, 3], [5, 3, 1]])
print(x_test)
print(x_test.mean())  # mean over every element
print(x_test.mean(axis=0))  # column means; same as x_test.mean(0)
Plotting (Graphics)
# Use subplots to display multiple plots in a single figure: a line plot on
# the left and a histogram on the right.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2)
ax1.plot(np.random.rand(10))
ax2.hist(np.random.randn(1000))

# Scatter of two seeded standard-normal samples, drawn with plot() and the
# 'o' marker format.
fig, ax = plt.subplots(figsize=(8, 8))
rng = np.random.default_rng(1)
x = rng.standard_normal(100)
y = rng.standard_normal(100)
ax.plot(x, y, 'o');

# The same data drawn with scatter(), plus axis labels and a title.
fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(x, y, marker='o')
ax.set_xlabel("the x-axis")
ax.set_ylabel("the y-axis")
ax.set_title("plot test");

# A 2x3 grid of axes; individual panels are addressed as axes[row, column].
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 5))
axes[0, 1].plot(x, y, 'o')  # first row, second column
axes[1, 2].scatter(x, y, marker='+')  # second row, third column
fig

# Write the current figure to disk in two formats.
fig.savefig("Figure.png", dpi=400)
fig.savefig("Figure.pdf", dpi=200);

# Grid of 50 points on [-pi, pi], used for both axes.
x = np.linspace(-np.pi, np.pi, 50)
y = x
print(f"this is x: {x}")
print(f"this is y: {y}")

# f[i, j] = cos(y[i]) / (1 + x[j]**2), built as an outer product.
fig, ax = plt.subplots(figsize=(8, 8))
f = np.multiply.outer(np.cos(y), 1 / (1 + x ** 2))
ax.contour(x, y, f);

# More contour levels give a finer picture of the surface.
fig, ax = plt.subplots(figsize=(8, 8))
ax.contour(x, y, f, levels=40);

fig, ax = plt.subplots(figsize=(8, 8))
ax.contour(x, y, f, levels=40);

# Contours of the same surface on 3-D axes.
fig = plt.figure()
ax3d = fig.add_subplot(111, projection='3d')
ax3d.contour3D(x, y, f, 50, cmap='binary');

# The same matrix rendered as an image.
fig, ax = plt.subplots(figsize=(8, 8))
ax.imshow(f);
Indexing Data
# 11 evenly spaced values from 0 to 10 inclusive.
seq1 = np.linspace(0, 10, 11)
seq1
#np.linspace?

# Integers from 0 up to, but not including, 11.
seq2 = np.arange(0, 11)
seq2
#np.arange?

"machine learning"[3:6]  # same as "machine learning"[slice(3, 6)]

# Show the documentation for slice.  The IPython-only `slice?` form is not
# valid Python; help() works in both a notebook and a plain script.
help(slice)

# A 4x4 matrix holding 0..15, filled row by row.
A_matrix = np.array(np.arange(16)).reshape((4, 4))
print(A_matrix)
print(A_matrix[1, 2])  # second row, third column

A_matrix[[1, 3]]  # selecting the second and fourth rows
A_matrix[:, [0, 3]]  # selecting the first and fourth columns

# Two index lists are paired up element by element, so this does not give
# the submatrix, only the two elements A_matrix[0, 1] and A_matrix[2, 3].
A_matrix[[0, 2], [1, 3]]

# One way around it: subset the rows and the columns independently.
A_matrix[[0, 2]][:, [1, 3]]

# np.ix_ builds the intermediate index arrays for us instead of subsetting
# manually in two steps.
idx = np.ix_([0, 2], [1, 3])
print(idx)
A_matrix[idx]

# start:stop:step, with stop excluded.
# Rows 1:4:2 -> the second and fourth rows.
# Columns 0:2:1 -> the first and second columns.
A_matrix[1:4:2, 0:2:1]

# Boolean masks: one flag per row, all False to begin with.
boolean_rows = np.zeros(A_matrix.shape[0], bool)
boolean_rows
#np.zeros?
boolean_rows[[1, 2]] = True
boolean_rows

# False here: the mask is [False, True, True, False], so its last entry
# differs from the last entry of [0, 1, 1, 1].
np.all(boolean_rows == np.array([0, 1, 1, 1]))
A_matrix

# Booleans and integers are treated differently by NumPy:
# an integer array picks rows by position (row 0, then row 1 three times) ...
A_matrix[np.array([0, 1, 1, 1])]
# ... while a boolean array keeps the rows whose flag is True (rows 1 and 2).
A_matrix[boolean_rows]

boolean_cols = np.zeros(A_matrix.shape[1], bool)
boolean_cols[[1, 3]] = True

# Another way to create a mesh / submatrix of A_matrix: np.ix_ with masks.
idx_bool = np.ix_(boolean_rows, boolean_cols)
A_matrix[idx_bool]

# np.ix_ also accepts a boolean mask mixed with an integer list.
idx_mixed = np.ix_(boolean_rows, [1, 3])
A_matrix[idx_mixed]
Loading Data
# Read the comma-separated version of the Auto data set.
auto = pd.read_csv('Auto.csv')
auto

# Read the file whose fields are separated by whitespace.  sep=r'\s+' is used
# instead of the deprecated delim_whitespace=True, matching the read below.
Auto = pd.read_csv('Auto.data', sep=r'\s+')
Auto

auto['horsepower']
np.unique(auto['horsepower'])  # returns only the distinct values in that field
auto['horsepower'].sum()
#np.unique?

# Treat '?' as a missing value (NaN) while reading.
auto = pd.read_csv('Auto.data', na_values=['?'], sep=r'\s+')
np.unique(auto['horsepower'])
auto['horsepower'].sum()

auto.shape  # 397 observations n and 9 variables, fields or columns
Auto_new = auto.dropna()  # drops the observations (rows) with missing values
Auto_new.shape

auto = Auto_new
auto.columns

# Selecting rows and columns.
auto[:4]  # the first four rows
idx_above_80 = auto['year'] > 80  # boolean Series, one flag per row
print(idx_above_80)
auto[idx_above_80]
auto[['horsepower', 'year']]
auto.index

# Use the 'name' column as the index so rows can be looked up by car name.
auto_mod = auto.set_index('name')
auto_mod
auto_mod.columns

# .loc selects by label: the names are the index, so we can access the
# observations through them.
auto_mod.loc[['ford ranger', 'chevy s-10']]

# .iloc selects by integer position.
auto_mod.iloc[[3, 5]]
auto.iloc[[3, 5]]
auto_mod.iloc[:, [1, 2]]
auto.iloc[:, [1, 2]]  # only the second and third columns, without the name index
auto_mod.iloc[[3, 4], [0, 2, 3]]  # fourth and fifth observations; first, third and fourth columns
auto_mod.loc['ford galaxie 500', ['year', 'mpg']]

# Boolean row selection combined with column labels.
idx_80 = auto_mod['year'] > 80
auto_mod.loc[idx_80, ['weight', 'origin']]

# The lambda receives the data frame and returns the mask df['year'] > 80.
auto_mod.loc[lambda df: df['year'] > 80, ['weight', 'origin']]
auto_mod.loc[lambda df: (df['year'] > 80) & (df['mpg'] > 40), ['weight', 'origin']]

# Retrieve rows with displacement < 300 whose index name contains 'ford' or
# 'datsun'.
auto_mod.loc[
    lambda df: (df['displacement'] < 300)
    & (df.index.str.contains('ford') | df.index.str.contains('datsun')),
    ['weight', 'origin'],
]
# Load the comma-separated file from the shared data-sets directory.
Auto = pd.read_csv('../data-sets/Auto.csv')
Auto
Auto.shape
np.unique(Auto['horsepower'])  # some observations are recorded as "?"

# Load the whitespace-separated file, reading '?' as a missing value.
Auto = pd.read_csv(
    '../data-sets/Auto.data',
    na_values=['?'],
    sep=r'\s+',
)
Auto['horsepower'].sum()

# Drop the rows that contain missing values.
Auto_new = Auto.dropna()
Auto_new.shape

# Rows with year greater than 80, selected with a boolean mask.
idx_80 = Auto_new['year'] > 80
idx_80
Auto_new[idx_80]

# Positional selection: the third and fourth rows, then three columns.
Auto_new.iloc[[2, 3]]
Auto_new.iloc[:, [0, 1, 4]]
For Loops
# Sum the values of a list with a simple loop.
total = 0
for value in [2, 3, 19]:
    total += value
print(total)

# Nested loops: add value * weight for every (value, weight) combination.
total = 0
for value in [1, 10, 20]:
    for weight in [50, 60, 70]:
        total += value * weight
print(total)

# zip() walks two sequences in lockstep; here it accumulates the sum of
# event * probability.
total = 0
for event, probability in zip([4, 6, 3], [0.2, 0.7, 0.1]):
    total += event * probability
print(total)

# Show the documentation for zip.  The IPython-only `zip?` form is not valid
# Python; help() works in both a notebook and a plain script.
help(zip)

# zip() returns a lazy iterator, so printing it shows the zip object itself
# rather than the pairs.
print(zip([4, 6, 3], [0.2, 0.7, 0.1]))

names = ["Alice", "Bob", "Charlie", "Dave"]  # 4 elements
ages = [25, 30, 35]  # 3 elements
zipped = zip(names, ages)
print(zipped)
# zip stops at the shorter input, so only three pairs are produced.
print(list(zipped))
# Build a 127x5 matrix of seeded standard-normal draws.
rng = np.random.default_rng(1)
A = rng.standard_normal((127, 5))
A

# Mask with the same shape as A: each entry is 0 with probability 0.8 and
# NaN with probability 0.2.
M = rng.choice([0, np.nan], p=[0.8, 0.2], size=A.shape)
M

# Adding the mask leaves entries unchanged where M is 0 and turns them into
# NaN where M is NaN.
A += M
print(A)
A.shape

D = pd.DataFrame(A, columns=['gpa', 'speed', 'defense', 'agility', 'armor'])
D
D[:4]  # the first four rows

# Report the share of missing values in each column.  The mean of the boolean
# NaN mask is the fraction of NaN entries; {1:.2%} prints it as a percentage.
template = 'Column "{0}" has {1:.2%} missing values'
for col in D.columns:
    print(template.format(col, np.isnan(D[col]).mean()))
Graphical and Numerical Summaries
# Load the data used for the plots and summaries below.
Auto = pd.read_csv("../data-sets/Auto.csv")

# Scatter plot drawn directly on matplotlib axes.
fig, ax = plt.subplots(figsize=(8, 8))
ax.plot(Auto['horsepower'], Auto['mpg'], 'o');

# The same plot through the pandas plotting interface, which returns the axes.
ax = Auto.plot.scatter('horsepower', 'mpg');
ax.set_title('Horsepower vs. MPG')
ax.figure

# The figure that owns the axes can be saved to disk.
figure = ax.figure
figure.savefig('../figures/horsepower_mpg.png')

# Two scatter plots side by side, each drawn on an explicitly chosen axes.
fig, axes = plt.subplots(ncols=2, figsize=(15, 5))
Auto.plot.scatter('horsepower', 'mpg', ax=axes[1])
Auto.plot.scatter('horsepower', 'year', ax=axes[0])

Auto.horsepower.dtype
Auto.cylinders.dtype
Auto

# cylinders has only a small number of possible values, so it can be turned
# into a categorical variable using pd.Series().
Auto.cylinders = pd.Series(Auto.cylinders, dtype='category')
Auto.cylinders

# One box of mpg per cylinders category.
fig, ax = plt.subplots(figsize=(8, 8))
Auto.boxplot('mpg', by='cylinders', ax=ax);

# Histograms of mpg with three different bin counts.
fig, axes = plt.subplots(ncols=3, figsize=(8, 8))
Auto.hist('mpg', color='red', bins=9, ax=axes[0]);
Auto.hist('mpg', color='blue', bins=20, ax=axes[1]);
Auto.hist('mpg', color='green', bins=50, ax=axes[2]);
Auto.mpg

# Pairwise scatter plots: the whole frame, then a chosen subset of columns.
pd.plotting.scatter_matrix(Auto);
pd.plotting.scatter_matrix(Auto[['mpg', 'displacement', 'weight']]);

# Numerical summaries of selected columns.
Auto[['mpg', 'weight']].describe()
Auto['mpg'].describe()
