#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Mar  7 22:07:56 2021

@author: ckuemmerle

Per-discipline linear regression of salary on years since PhD.

The script reads 'Salaries.csv', splits its rows into those whose
'discipline' value is 'A' and all remaining rows (labelled 'B' in the plot),
draws a scatter plot of 'yrs.since.phd' vs. 'salary' for both groups, fits
one ordinary least-squares line per group and overlays it, and finally shows
a scatter matrix and summary statistics of the full data set.
"""
import pandas as pd
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

# Load the data into a pandas DataFrame.
df = pd.read_csv('Salaries.csv')

# Boolean mask that is True for rows with discipline == 'A'.
catA = df['discipline'] == 'A'
dfA = df.loc[catA]   # rows whose discipline is 'A'
dfB = df.loc[~catA]  # all remaining rows (plotted and labelled as 'B')

# Scatter plot of 'yrs.since.phd' vs. 'salary' for group 'A' ...
ax1 = dfA.plot.scatter(x='yrs.since.phd', y='salary', c='red',
                       label='A', legend=True)
# ... and for group 'B', drawn on the same axes so both appear in one figure.
dfB.plot.scatter(x='yrs.since.phd', y='salary', c='blue',
                 label='B', ax=ax1, legend=True)

# Regressors are kept as single-column DataFrames (2-D), the shape that
# LinearRegression.fit is given for its feature matrix here.
feature_col = ['yrs.since.phd']
XA = dfA.loc[:, feature_col]  # regressors, group 'A'
XB = dfB.loc[:, feature_col]  # regressors, group 'B'
yA = dfA.salary               # dependent variable, group 'A'
yB = dfB.salary               # dependent variable, group 'B'

regression_catA = LinearRegression().fit(XA, yA)  # fit on group 'A' data
regression_catB = LinearRegression().fit(XB, yB)  # fit on group 'B' data

# Regression lines written out explicitly as intercept + slope * x.
yA_regressed = regression_catA.intercept_ + regression_catA.coef_ * XA
yB_regressed = regression_catB.intercept_ + regression_catB.coef_ * XB

ax1.plot(XA, yA_regressed, c='red')   # regression line for group 'A'
ax1.plot(XB, yB_regressed, c='blue')  # regression line for group 'B'
fig = plt.gcf()
fig.set_size_inches(11, 8)  # enlarge the figure for readability

# The following code reveals some further ways of understanding the data set.
pd.plotting.scatter_matrix(df)  # grid of pairwise scatter plots and histograms
fig = plt.gcf()
fig.set_size_inches(11, 8)  # enlarge the figure for readability

# 'display' is only injected into the namespace by IPython/Jupyter; fall back
# to print so the script also runs under a plain Python interpreter.
try:
    show_frame = display  # noqa: F821
except NameError:
    show_frame = print
show_frame(df)

# Compute and print summary statistics of the data set.
description = df.describe()
print(description)

# Needed when run as a plain script; without it the figures are never shown.
plt.show()