#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Mar  7 22:07:56 2021

@author: ckuemmerle
"""

import pandas as pd
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

# Load the salary data set from 'Salaries.csv' into a pandas DataFrame.
df = pd.read_csv('Salaries.csv')

# Boolean mask over the rows of `df`: True where the 'discipline' column equals 'A'.
catA = df['discipline'] == 'A'
dfA = df.loc[catA]   # rows whose discipline value is 'A'
dfB = df.loc[~catA]  # all remaining rows (treated as discipline 'B' by the rest of the script)

# Scatter plot of 'yrs.since.phd' vs. 'salary' for the rows in `dfA` (red).
# The returned Axes is kept in `ax1` so later artists can be drawn into the same plot.
ax1 = dfA.plot.scatter(
    x='yrs.since.phd',
    y='salary',
    c='red',
    label='A',
    legend=True,
)

# Same two columns for the rows in `dfB` (blue); passing ax=ax1 adds the points
# to the existing plot instead of opening a new one.
dfB.plot.scatter(
    x='yrs.since.phd',
    y='salary',
    c='blue',
    label='B',
    ax=ax1,
    legend=True,
)

# Regressor column(s). Indexing with a list makes `.loc` return a two-dimensional
# DataFrame rather than a one-dimensional Series.
feature_col = ['yrs.since.phd']
XA = dfA.loc[:, feature_col]  # regressors for the category 'A' data
XB = dfB.loc[:, feature_col]  # regressors for the category 'B' data

yA = dfA.salary  # dependent variable ('salary') for the category 'A' data
yB = dfB.salary  # dependent variable ('salary') for the category 'B' data

# Fit one ordinary least-squares line per category.
regression_catA = LinearRegression().fit(XA, yA)
regression_catB = LinearRegression().fit(XB, yB)

# Fitted salaries on each regression line, evaluated at the observed regressor
# values. `predict` computes intercept_ + coef_ * X for us.
yA_regressed = regression_catA.predict(XA)
yB_regressed = regression_catB.predict(XB)

# Draw both regression lines into the scatter plot, colour-matched to their points.
ax1.plot(XA, yA_regressed, c='red')   # regression line for category 'A'
ax1.plot(XB, yB_regressed, c='blue')  # regression line for category 'B'

# Resize the current figure to 11 x 8 inches so the plot is easier to read.
fig = plt.gcf()
fig.set_size_inches((11, 8))
"""
The following code reveals some further ways of understanding the datasets:
"""
# Plot several scatter plots and histograms of the columns of `df` in one figure.
pd.plotting.scatter_matrix(df)
fig = plt.gcf()
fig.set_size_inches(11, 8)  # resize the figure to 11 x 8 inches

# Show the raw table. `display` is only defined inside IPython/Jupyter sessions;
# under a plain Python interpreter the name does not exist, so fall back to
# `print` instead of crashing with a NameError.
try:
    display(df)
except NameError:
    print(df)

# Compute and print summary statistics of the data set.
description = df.describe()
print(description)