csci5612 data prep

Data Preparation and Exploratory Data Analysis

Data Sets

https://www.kaggle.com/datasets/arshkon/linkedin-job-postings

500mb, 100k rows
scraped job postings from linkedin from 2023-2024
columns: company_name, company_employee_count, company_follower_count, title, description, max_salary, location, views

https://www.kaggle.com/datasets/ayushtankha/70k-job-applicants-data-human-resource

13mb, 70k rows
stackoverflow survey results from developer job applicants from 2023
columns: age, gender, ed_level, years_code_pro, prev_salary, employed

Obtaining the Data

code_get.py


import os
import io
import zipfile
import requests
import pandas as pd

def get_kaggle_data():
    """Load the Stack Overflow job-applicant survey as a DataFrame.

    A cached copy in ``stackoverflow.csv`` is used when it exists and is
    larger than the size threshold. Otherwise the dataset archive is
    downloaded from Kaggle, its CSV member is parsed, and the raw CSV bytes
    are cached to ``stackoverflow.csv`` for the next run.

    Returns:
        pandas.DataFrame: the parsed survey data.

    Raises:
        requests.HTTPError: if the download responds with an error status.
        Exception: if the downloaded archive contains no CSV file.
    """
    api_endpoint = "https://www.kaggle.com/api/v1/datasets/download/ayushtankha/70k-job-applicants-data-human-resource"
    datafile = "stackoverflow.csv"
    # Presumably the approximate size of a complete download; a smaller file
    # is treated as missing/incomplete and fetched again.
    datasize = 13000000
    if os.path.exists(datafile) and os.path.getsize(datafile) > datasize:
        print("loading from %s" % datafile)
        return pd.read_csv(datafile)

    # The archive is read fully into memory for zipfile, so streaming the
    # response gains nothing. The (connect, read) timeout keeps a stalled
    # connection from hanging the script forever.
    response = requests.get(api_endpoint, timeout=(10, 60))
    response.raise_for_status()
    with zipfile.ZipFile(io.BytesIO(response.content), "r") as zf:
        fname = None
        for filename in zf.namelist():
            # Match on the extension; when several members match, the last
            # one wins.
            if filename.lower().endswith(".csv"):
                fname = filename
        if not fname:
            raise Exception("couldn't find csv in zip")
        print("extracting %s from %s" % (fname, api_endpoint))
        with zf.open(fname, "r") as member:
            csv_bytes = member.read()

    # Parse before caching so an unreadable download never becomes the cache.
    df = pd.read_csv(io.BytesIO(csv_bytes))

    # Write the bytes untouched: a text-mode handle would re-encode with the
    # platform default encoding and translate newlines.
    with open(datafile, "wb") as cache_file:
        cache_file.write(csv_bytes)
    return df

if __name__ == "__main__":
    # Fetch the dataset (or load the cached copy) and list its columns.
    print(get_kaggle_data().columns)

code_get.py output


loading from stackoverflow.csv
Index(['Unnamed: 0', 'Age', 'Accessibility', 'EdLevel', 'Employment', 'Gender',
       'MentalHealth', 'MainBranch', 'YearsCode', 'YearsCodePro', 'Country',
       'PreviousSalary', 'HaveWorkedWith', 'ComputerSkills', 'Employed'],
      dtype='object')

Exploring the Data

code_raw_vis.py


from code_get import get_kaggle_data
import pandas as pd
import matplotlib.pyplot as plt

def make_bar_age(age_col, output_png):
    """Save a bar chart of the number of applicants per age category.

    Args:
        age_col: age value of each applicant (a pandas Series or anything
            ``pd.Series`` accepts). Missing values are not counted.
        output_png: path of the image file to write.
    """
    # One pass over the column instead of one scan per category. Sorting by
    # label keeps the bar order identical between runs, which iterating a
    # set of strings does not guarantee.
    tally = pd.Series(age_col).value_counts().sort_index()
    fig = plt.figure(figsize=(8, 6))
    try:
        plt.bar(tally.index.tolist(), tally.tolist())
        plt.title("Number of Applicants Comparison By Age")
        plt.xlabel("Age (years)")
        plt.ylabel("Count")
        plt.savefig(output_png)
    finally:
        # pyplot keeps every figure alive until it is closed explicitly.
        plt.close(fig)

def make_bar_gender(gender_col, output_png):
    """Save a bar chart of the number of applicants per gender.

    Args:
        gender_col: gender value of each applicant (a pandas Series or
            anything ``pd.Series`` accepts). Missing values are not counted.
        output_png: path of the image file to write.
    """
    # One pass over the column instead of one scan per category. Sorting by
    # label keeps the bar order identical between runs, which iterating a
    # set of strings does not guarantee.
    tally = pd.Series(gender_col).value_counts().sort_index()
    fig = plt.figure(figsize=(8, 6))
    try:
        plt.bar(tally.index.tolist(), tally.tolist())
        plt.title("Number of Applicants Comparison By Gender")
        plt.xlabel("Gender")
        plt.ylabel("Count")
        plt.savefig(output_png)
    finally:
        # pyplot keeps every figure alive until it is closed explicitly.
        plt.close(fig)

def make_bar_dev(dev_col, output_png):
    """Save a bar chart of the number of applicants per developer status.

    Args:
        dev_col: developer-status value of each applicant (a pandas Series
            or anything ``pd.Series`` accepts). Missing values are not
            counted.
        output_png: path of the image file to write.
    """
    # One pass over the column instead of one scan per category. Sorting by
    # label keeps the bar order identical between runs, which iterating a
    # set of strings does not guarantee.
    tally = pd.Series(dev_col).value_counts().sort_index()
    fig = plt.figure(figsize=(8, 6))
    try:
        plt.bar(tally.index.tolist(), tally.tolist())
        plt.title("Number of Applicants Comparison By Developer Status")
        plt.xlabel("Developer Status")
        plt.ylabel("Count")
        plt.savefig(output_png)
    finally:
        # pyplot keeps every figure alive until it is closed explicitly.
        plt.close(fig)

def make_histogram_years_code(years_code_col, output_png):
    """Save a 25-bin histogram of applicants' years of coding experience.

    Args:
        years_code_col: years of coding experience, one value per applicant.
        output_png: path of the image file to write.
    """
    fig = plt.figure(figsize=(8, 6))
    try:
        plt.hist(x=years_code_col, bins=25)
        # Fixed y range; presumably chosen so this chart and the
        # professional-years histogram share a scale.
        plt.ylim((0, 13000))
        plt.title("Applicant Distribution of Years Experience Coding")
        plt.xlabel("Years of Experience Coding")
        plt.ylabel("Count")
        plt.savefig(output_png)
    finally:
        # pyplot keeps every figure alive until it is closed explicitly.
        plt.close(fig)

def make_histogram_years_code_pro(years_code_col, output_png):
    """Save a 25-bin histogram of applicants' professional coding years.

    Args:
        years_code_col: professional years of coding experience, one value
            per applicant.
        output_png: path of the image file to write.
    """
    fig = plt.figure(figsize=(8, 6))
    try:
        plt.hist(x=years_code_col, bins=25)
        # Fixed y range; presumably chosen so this chart and the general
        # years-of-coding histogram share a scale.
        plt.ylim((0, 13000))
        plt.title("Applicant Distribution of Professional Years Experience Coding")
        plt.xlabel("Professional Years of Experience Coding")
        plt.ylabel("Count")
        plt.savefig(output_png)
    finally:
        # pyplot keeps every figure alive until it is closed explicitly.
        plt.close(fig)

def make_histogram_salary(salary_col, output_png):
    """Save a 25-bin histogram of applicants' previous salaries.

    Args:
        salary_col: previous salary, one value per applicant.
        output_png: path of the image file to write.
    """
    fig = plt.figure(figsize=(8, 6))
    try:
        plt.hist(x=salary_col, bins=25)
        plt.title("Applicant Distribution of Previous Job's Salary")
        plt.xlabel("Previous Salary ($)")
        plt.ylabel("Count")
        plt.savefig(output_png)
    finally:
        # pyplot keeps every figure alive until it is closed explicitly.
        plt.close(fig)

def make_histogram_skills(skills_col, output_png):
    """Save a 50-bin histogram of the number of computer skills listed.

    Args:
        skills_col: number of listed computer skills, one value per
            applicant.
        output_png: path of the image file to write.
    """
    fig = plt.figure(figsize=(8, 6))
    try:
        plt.hist(x=skills_col, bins=50)
        plt.title("Applicant Distribution of Number of Computer Skills Listed")
        plt.xlabel("Applicant Number of Listed Computer Skills")
        plt.ylabel("Count")
        plt.savefig(output_png)
    finally:
        # pyplot keeps every figure alive until it is closed explicitly.
        plt.close(fig)

def make_scatter_skill_salary(skills_col, salary_col, output_png):
    """Save a scatter plot of listed computer skills against previous salary.

    Args:
        skills_col: number of listed computer skills (x axis), one value per
            applicant.
        salary_col: previous salary (y axis), in the same applicant order as
            ``skills_col``.
        output_png: path of the image file to write.
    """
    fig = plt.figure(figsize=(8, 6))
    try:
        # Small markers (s=4) for the scatter points.
        plt.scatter(skills_col, salary_col, s=4)
        plt.title("Applicant Number of Skills vs Previous Job's Salary")
        plt.xlabel("Applicant Number of Listed Computer Skills")
        plt.ylabel("Salary ($)")
        plt.savefig(output_png)
    finally:
        # pyplot keeps every figure alive until it is closed explicitly.
        plt.close(fig)

def make_scatter_experience_salary(experience_col, salary_col, output_png):
    """Save a scatter plot of professional coding years against salary.

    Args:
        experience_col: professional years of coding experience (x axis),
            one value per applicant.
        salary_col: previous salary (y axis), in the same applicant order as
            ``experience_col``.
        output_png: path of the image file to write.
    """
    fig = plt.figure(figsize=(8, 6))
    try:
        # Small markers (s=4) for the scatter points.
        plt.scatter(experience_col, salary_col, s=4)
        plt.title("Applicant Professional Years of Coding Experience vs Previous Job's Salary")
        plt.xlabel("Applicant Professional Years of Coding Experience")
        plt.ylabel("Salary ($)")
        plt.savefig(output_png)
    finally:
        # pyplot keeps every figure alive until it is closed explicitly.
        plt.close(fig)

def make_plot_experience_median_salary(experience_col, salary_col, output_png):
    """Save a line plot of median previous salary per year of experience.

    Every whole year between the smallest and largest value of
    ``experience_col`` (inclusive) gets one point: the median of the
    salaries of applicants with exactly that many years.

    Args:
        experience_col: professional years of coding experience, one
            whole-number value per applicant.
        salary_col: previous salaries as a pandas Series, in the same
            applicant order as ``experience_col``.
        output_png: path of the image file to write.

    Raises:
        ValueError: if ``experience_col`` is empty.
    """
    # int() lets the bounds drive range() even when the column is stored as
    # floats holding whole numbers.
    first_year = int(min(experience_col))
    last_year = int(max(experience_col))
    years_exp = list(range(first_year, last_year + 1))
    # A year nobody reported has an empty selection, whose median is NaN.
    median_salaries = [
        salary_col[experience_col == year].median() for year in years_exp
    ]
    fig = plt.figure(figsize=(8, 6))
    try:
        plt.plot(years_exp, median_salaries)
        plt.title("Applicant Professional Years of Coding Experience vs Median Previous Job's Salary")
        plt.xlabel("Applicant Professional Years of Coding Experience")
        plt.ylabel("Median Salary ($)")
        plt.savefig(output_png)
    finally:
        # pyplot keeps every figure alive until it is closed explicitly.
        plt.close(fig)


if __name__ == "__main__":
    df = get_kaggle_data()

    # One entry per chart of the raw data:
    # (progress message, plotting function, input columns, output file).
    plot_jobs = [
        ("making bar chart of age",
         make_bar_age, ["Age"], "raw_bar_age.png"),
        ("making bar chart of gender",
         make_bar_gender, ["Gender"], "raw_bar_gender.png"),
        ("making bar chart of developer types",
         make_bar_dev, ["MainBranch"], "raw_bar_dev.png"),
        ("making histogram of years coding",
         make_histogram_years_code, ["YearsCode"], "raw_hist_years_coding.png"),
        ("making histogram of professional years coding",
         make_histogram_years_code_pro, ["YearsCodePro"], "raw_hist_years_coding_pro.png"),
        ("making histogram of previous salary",
         make_histogram_salary, ["PreviousSalary"], "raw_hist_salary.png"),
        ("making histogram of computer skills",
         make_histogram_skills, ["ComputerSkills"], "raw_hist_skills.png"),
        ("making scatter of skills vs previous salary",
         make_scatter_skill_salary, ["ComputerSkills", "PreviousSalary"],
         "raw_scatter_skill_salary.png"),
        ("making scatter of pro years exp vs previous salary",
         make_scatter_experience_salary, ["YearsCodePro", "PreviousSalary"],
         "raw_scatter_experience_salary.png"),
        ("making plot of pro years exp vs median previous salary",
         make_plot_experience_median_salary, ["YearsCodePro", "PreviousSalary"],
         "raw_plot_experience_median_salary.png"),
    ]

    # Announce each chart, then render it from the raw columns it needs.
    for message, plot_func, column_names, output_png in plot_jobs:
        print(message)
        plot_func(*[df[name] for name in column_names], output_png)

Comments:

The age of participants is well balanced between the two categories of younger and older than 35. The ratio of men to other genders in the survey is very large. The majority (92%) of the respondents considered themselves professional developers. There is a clearly visible shift between the distributions of general years of coding experience and professional years of experience, indicating that most people started coding before they became professionals. The number of computer skills a person lists varies widely, but salaries span the full range from the low end to the very top regardless of how many skills were listed, even for applicants who listed none or very few. The scatter plot of professional years of experience shows that more than 40 years is very rare, which is further indicated by the drop in the plot of median salaries after 40 years of experience.

Cleaning the Data

code_clean.py


from code_get import get_kaggle_data

def clean_data(df):
    """Return a cleaned copy of the survey data; ``df`` is left unmodified.

    Drops the unnamed index column and the unused 'Accessibility',
    'HaveWorkedWith' and 'Employment' columns, then keeps only rows whose
    'MainBranch' is "Dev" and whose 'YearsCodePro' is at most 42. Progress
    messages are printed along the way.

    Args:
        df: survey DataFrame containing the columns named above.

    Returns:
        A new DataFrame with the remaining columns and rows.
    """
    print("number of records: %d" % len(df))
    print("removing unused columns: 'Accessibility', 'HaveWorkedWith', 'Employment'")
    # The unnamed index column goes out together with the unused ones.
    discarded = ["Unnamed: 0", "Accessibility", "HaveWorkedWith", "Employment"]
    trimmed = df.drop(columns=discarded)

    print("removing rows of non-devs")
    is_dev = trimmed["MainBranch"] == "Dev"
    devs_only = trimmed.loc[is_dev]

    print("removing outliers of >42 years pro experience")
    within_limit = devs_only["YearsCodePro"] <= 42
    return devs_only.loc[within_limit]

if __name__ == "__main__":
    df = get_kaggle_data()
    # Report missing values per column before anything is removed.
    print("number of nulls in each col:")
    print(df.isnull().sum())
    print()

    clean_df = clean_data(df)
    print("cleaned data set number of records: %d" % len(clean_df))
    # index=False: the row labels are only leftovers of the filtering, and
    # writing them would re-create the unnamed index column that clean_data
    # drops.
    clean_df.to_csv("stackoverflow_clean.csv", index=False)

code_clean.py output


loading from stackoverflow.csv
number of nulls in each col:
Unnamed: 0         0
Age                0
Accessibility      0
EdLevel            0
Employment         0
Gender             0
MentalHealth       0
MainBranch         0
YearsCode          0
YearsCodePro       0
Country            0
PreviousSalary     0
HaveWorkedWith    63
ComputerSkills     0
Employed           0
dtype: int64

number of records: 73462
removing unused columns: 'Accessibility', 'HaveWorkedWith', 'Employment'
removing rows of non-devs
removing outliers of >42 years pro experience
cleaned data set number of records: 67265

Visuals of the Cleaned Data

code_new_vis.py


from code_get import get_kaggle_data
from code_clean import clean_data

from code_raw_vis import (
    make_bar_age,
    make_bar_gender,
    make_bar_dev,
    make_histogram_years_code,
    make_histogram_years_code_pro,
    make_histogram_salary,
    make_histogram_skills,
    make_scatter_skill_salary,
    make_scatter_experience_salary,
    make_plot_experience_median_salary,
)

if __name__ == "__main__":
    clean_df = clean_data(get_kaggle_data())

    # One entry per chart of the cleaned data:
    # (progress message, plotting function, input columns, output file).
    plot_jobs = [
        ("making bar chart of age",
         make_bar_age, ["Age"], "new_bar_age.png"),
        ("making bar chart of gender",
         make_bar_gender, ["Gender"], "new_bar_gender.png"),
        ("making bar chart of developer types",
         make_bar_dev, ["MainBranch"], "new_bar_dev.png"),
        ("making histogram of years coding",
         make_histogram_years_code, ["YearsCode"], "new_hist_years_coding.png"),
        ("making histogram of professional years coding",
         make_histogram_years_code_pro, ["YearsCodePro"], "new_hist_years_coding_pro.png"),
        ("making histogram of previous salary",
         make_histogram_salary, ["PreviousSalary"], "new_hist_salary.png"),
        ("making histogram of computer skills",
         make_histogram_skills, ["ComputerSkills"], "new_hist_skills.png"),
        ("making scatter of skills vs previous salary",
         make_scatter_skill_salary, ["ComputerSkills", "PreviousSalary"],
         "new_scatter_skill_salary.png"),
        ("making scatter of pro years exp vs previous salary",
         make_scatter_experience_salary, ["YearsCodePro", "PreviousSalary"],
         "new_scatter_experience_salary.png"),
        ("making plot of pro years exp vs median previous salary",
         make_plot_experience_median_salary, ["YearsCodePro", "PreviousSalary"],
         "new_plot_experience_median_salary.png"),
    ]

    # Announce each chart, then render it from the cleaned columns it needs.
    for message, plot_func, column_names, output_png in plot_jobs:
        print(message)
        plot_func(*[clean_df[name] for name in column_names], output_png)