https://www.kaggle.com/datasets/arshkon/linkedin-job-postings
https://www.kaggle.com/datasets/ayushtankha/70k-job-applicants-data-human-resource
code_get.py
"""Load the Kaggle job-applicant dataset, downloading and caching the CSV on first use."""
import io
import os
import zipfile

import pandas as pd

API_ENDPOINT = "https://www.kaggle.com/api/v1/datasets/download/ayushtankha/70k-job-applicants-data-human-resource"
DATAFILE = "stackoverflow.csv"
# A cache file is only trusted when it is strictly larger than this many bytes;
# anything smaller is treated as missing/incomplete and downloaded again.
MIN_DATASIZE = 13000000
# Seconds passed to ``requests`` as the timeout so a stalled server cannot
# block the script indefinitely.
REQUEST_TIMEOUT = 60


def _download_csv_bytes(api_endpoint, timeout):
    """Download the zip archive at *api_endpoint* and return its CSV member as bytes.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        ValueError: if the archive contains no ``.csv`` member.
    """
    # Imported lazily: only a cache miss needs the HTTP client, so loading an
    # existing cache file keeps working where ``requests`` is unavailable.
    import requests

    response = requests.get(api_endpoint, timeout=timeout)
    response.raise_for_status()
    with zipfile.ZipFile(io.BytesIO(response.content), "r") as zf:
        # Match on the extension rather than a substring so that names such
        # as "data.csv.bak" are not mistaken for the CSV.
        csv_names = [name for name in zf.namelist() if name.lower().endswith(".csv")]
        if not csv_names:
            raise ValueError("couldn't find csv in zip")
        # When several CSV members exist, the last one listed wins.
        fname = csv_names[-1]
        print("extracting %s from %s" % (fname, api_endpoint))
        return zf.read(fname)


def get_kaggle_data(
    datafile=DATAFILE,
    min_size=MIN_DATASIZE,
    api_endpoint=API_ENDPOINT,
    timeout=REQUEST_TIMEOUT,
):
    """Return the dataset as a DataFrame, using a local CSV cache when possible.

    Args:
        datafile: Path of the cached CSV file.
        min_size: The cache is used only if *datafile* is larger than this
            many bytes.
        api_endpoint: URL of the zip archive that contains the CSV.
        timeout: Timeout in seconds for the HTTP request.

    Returns:
        pandas.DataFrame parsed from the cached or freshly downloaded CSV.

    Raises:
        requests.HTTPError: if the download fails with an error status.
        ValueError: if the downloaded archive has no CSV member.
    """
    if os.path.exists(datafile) and os.path.getsize(datafile) > min_size:
        print("loading from %s" % datafile)
        return pd.read_csv(datafile)

    csv_bytes = _download_csv_bytes(api_endpoint, timeout)
    # Parse before caching so undecodable/unparseable data is never written
    # to disk as a seemingly valid cache file.
    df = pd.read_csv(io.BytesIO(csv_bytes), encoding="utf-8")
    # Write the archive member byte-for-byte: text mode would re-encode with
    # the platform's locale encoding and translate newlines.
    with open(datafile, "wb") as cache_file:
        cache_file.write(csv_bytes)
    return df


if __name__ == "__main__":
    df = get_kaggle_data()
    print(df.columns)
code_get.py output
loading from stackoverflow.csv
Index(['Unnamed: 0', 'Age', 'Accessibility', 'EdLevel', 'Employment', 'Gender',
       'MentalHealth', 'MainBranch', 'YearsCode', 'YearsCodePro', 'Country',
       'PreviousSalary', 'HaveWorkedWith', 'ComputerSkills', 'Employed'],
      dtype='object')
code_raw_vis.py
"""Exploratory plots of the job-applicant data; every ``make_*`` helper saves one PNG."""
import matplotlib.pyplot as plt
import pandas as pd

from code_get import get_kaggle_data


def _save_figure(fig, title, xlabel, ylabel, output_png):
    """Label the current axes, write *fig* to *output_png*, then close it."""
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    try:
        fig.savefig(output_png)
    finally:
        # pyplot keeps every figure alive until it is closed explicitly, so
        # close here to avoid accumulating figures across calls.
        plt.close(fig)


def _save_category_bar(values, title, xlabel, output_png):
    """Save a bar chart of how often each distinct value occurs in *values*."""
    # One counting pass; sorting the categories keeps the bar order stable
    # from run to run.
    counts = pd.Series(values).value_counts(sort=False).sort_index()
    fig = plt.figure(figsize=(8, 6))
    plt.bar(list(counts.index), counts.to_numpy())
    _save_figure(fig, title, xlabel, "Count", output_png)


def _save_histogram(values, bins, title, xlabel, output_png, ylim=None):
    """Save a histogram of *values*; *ylim* optionally fixes the y-axis range."""
    fig = plt.figure(figsize=(8, 6))
    plt.hist(x=values, bins=bins)
    if ylim is not None:
        plt.ylim(ylim)
    _save_figure(fig, title, xlabel, "Count", output_png)


def _save_scatter(x_values, y_values, title, xlabel, ylabel, output_png):
    """Save a small-marker scatter plot of *y_values* against *x_values*."""
    fig = plt.figure(figsize=(8, 6))
    plt.scatter(x_values, y_values, s=4)
    _save_figure(fig, title, xlabel, ylabel, output_png)


def make_bar_age(age_col, output_png):
    """Save a bar chart of applicant counts per age category."""
    _save_category_bar(
        age_col,
        "Number of Applicants Comparison By Age",
        "Age (years)",
        output_png,
    )


def make_bar_gender(gender_col, output_png):
    """Save a bar chart of applicant counts per gender."""
    _save_category_bar(
        gender_col,
        "Number of Applicants Comparison By Gender",
        "Gender",
        output_png,
    )


def make_bar_dev(dev_col, output_png):
    """Save a bar chart of applicant counts per developer status."""
    _save_category_bar(
        dev_col,
        "Number of Applicants Comparison By Developer Status",
        "Developer Status",
        output_png,
    )


def make_histogram_years_code(years_code_col, output_png):
    """Save a 25-bin histogram of years of coding experience."""
    _save_histogram(
        years_code_col,
        25,
        "Applicant Distribution of Years Experience Coding",
        "Years of Experience Coding",
        output_png,
        # Same fixed y-range as the professional-years histogram so the two
        # figures can be compared directly.
        ylim=(0, 13000),
    )


def make_histogram_years_code_pro(years_code_col, output_png):
    """Save a 25-bin histogram of professional years of coding experience."""
    _save_histogram(
        years_code_col,
        25,
        "Applicant Distribution of Professional Years Experience Coding",
        "Professional Years of Experience Coding",
        output_png,
        ylim=(0, 13000),
    )


def make_histogram_salary(salary_col, output_png):
    """Save a 25-bin histogram of previous salaries."""
    _save_histogram(
        salary_col,
        25,
        "Applicant Distribution of Previous Job's Salary",
        "Previous Salary ($)",
        output_png,
    )


def make_histogram_skills(skills_col, output_png):
    """Save a 50-bin histogram of the number of listed computer skills."""
    _save_histogram(
        skills_col,
        50,
        "Applicant Distribution of Number of Computer Skills Listed",
        "Applicant Number of Listed Computer Skills",
        output_png,
    )


def make_scatter_skill_salary(skills_col, salary_col, output_png):
    """Save a scatter plot of number of listed skills vs previous salary."""
    _save_scatter(
        skills_col,
        salary_col,
        "Applicant Number of Skills vs Previous Job's Salary",
        "Applicant Number of Listed Computer Skills",
        "Salary ($)",
        output_png,
    )


def make_scatter_experience_salary(experience_col, salary_col, output_png):
    """Save a scatter plot of professional years of experience vs previous salary."""
    _save_scatter(
        experience_col,
        salary_col,
        "Applicant Professional Years of Coding Experience vs Previous Job's Salary",
        "Applicant Professional Years of Coding Experience",
        "Salary ($)",
        output_png,
    )


def make_plot_experience_median_salary(experience_col, salary_col, output_png):
    """Save a line plot of median previous salary per year of professional experience.

    Both arguments are expected to be pandas Series sharing one index. Every
    whole year between the smallest and largest experience value gets a
    point; years with no applicants are NaN and show up as gaps in the line.
    """
    first_year = int(experience_col.min())
    last_year = int(experience_col.max())
    years_exp = list(range(first_year, last_year + 1))
    # Group once instead of re-filtering the salary column for every year;
    # reindex restores the years that have no rows as NaN entries.
    median_salaries = salary_col.groupby(experience_col).median().reindex(years_exp)
    fig = plt.figure(figsize=(8, 6))
    plt.plot(years_exp, median_salaries.to_numpy())
    _save_figure(
        fig,
        "Applicant Professional Years of Coding Experience vs Median Previous Job's Salary",
        "Applicant Professional Years of Coding Experience",
        "Median Salary ($)",
        output_png,
    )


if __name__ == "__main__":
    df = get_kaggle_data()
    print("making bar chart of age")
    make_bar_age(df["Age"], "raw_bar_age.png")
    print("making bar chart of gender")
    make_bar_gender(df["Gender"], "raw_bar_gender.png")
    print("making bar chart of developer types")
    make_bar_dev(df["MainBranch"], "raw_bar_dev.png")
    print("making histogram of years coding")
    make_histogram_years_code(df["YearsCode"], "raw_hist_years_coding.png")
    print("making histogram of professional years coding")
    make_histogram_years_code_pro(df["YearsCodePro"], "raw_hist_years_coding_pro.png")
    print("making histogram of previous salary")
    make_histogram_salary(df["PreviousSalary"], "raw_hist_salary.png")
    print("making histogram of computer skills")
    make_histogram_skills(df["ComputerSkills"], "raw_hist_skills.png")
    print("making scatter of skills vs previous salary")
    make_scatter_skill_salary(
        df["ComputerSkills"], df["PreviousSalary"], "raw_scatter_skill_salary.png"
    )
    print("making scatter of pro years exp vs previous salary")
    make_scatter_experience_salary(
        df["YearsCodePro"], df["PreviousSalary"], "raw_scatter_experience_salary.png"
    )
    print("making plot of pro years exp vs median previous salary")
    make_plot_experience_median_salary(
        df["YearsCodePro"], df["PreviousSalary"], "raw_plot_experience_median_salary.png"
    )
The applicants' ages are well balanced between the two categories, younger and older than 35. The gender distribution is heavily skewed: men vastly outnumber all other genders in the survey. The majority (92%) of the respondents considered themselves professional developers. There is a clearly visible shift between the distribution of general years of coding experience and that of professional years of experience, indicating that most people started coding before they became professionals. The number of computer skills a person lists varies widely, but salaries span the full range from the low end to the very top regardless of how many skills were listed, even for applicants who listed few or none. The scatter plot of professional years of experience shows that more than 40 years is very rare, which is further indicated by the drop in the plot of median salaries after 40 years of experience.
code_clean.py
"""Cleaning step: drop unused columns, non-developer rows and experience outliers."""
from code_get import get_kaggle_data


def clean_data(df):
    """Return a filtered copy of *df*, printing each cleaning step as it runs.

    The 'Accessibility', 'HaveWorkedWith' and 'Employment' columns are
    dropped, only rows whose 'MainBranch' equals "Dev" are kept, and rows
    with more than 42 in 'YearsCodePro' are removed. *df* itself is left
    unmodified.
    """
    print("number of records: %d" % len(df))

    print("removing unused columns: 'Accessibility', 'HaveWorkedWith', 'Employment'")
    unused_columns = ["Accessibility", "HaveWorkedWith", "Employment"]
    trimmed = df.drop(columns=unused_columns)

    print("removing rows of non-devs")
    is_dev = trimmed["MainBranch"] == "Dev"
    devs_only = trimmed.loc[is_dev]

    print("removing outliers of >42 years pro experience")
    plausible_experience = devs_only["YearsCodePro"] <= 42
    return devs_only.loc[plausible_experience]


if __name__ == "__main__":
    df = get_kaggle_data()

    # Report missing values per column before any cleaning happens.
    print("number of nulls in each col:")
    null_counts = df.isnull().sum()
    print(null_counts)
    print()

    clean_df = clean_data(df)
    print("cleaned data set number of records: %d" % len(clean_df))
code_clean.py output
loading from stackoverflow.csv
number of nulls in each col:
Unnamed: 0         0
Age                0
Accessibility      0
EdLevel            0
Employment         0
Gender             0
MentalHealth       0
MainBranch         0
YearsCode          0
YearsCodePro       0
Country            0
PreviousSalary     0
HaveWorkedWith    63
ComputerSkills     0
Employed           0
dtype: int64

number of records: 73462
removing unused columns: 'Accessibility', 'HaveWorkedWith', 'Employment'
removing rows of non-devs
removing outliers of >42 years pro experience
cleaned data set number of records: 67265
code_new_vis.py
"""Regenerate every exploratory plot from the cleaned data set (``new_*.png``)."""
from code_get import get_kaggle_data
from code_clean import clean_data
from code_raw_vis import (
    make_bar_age,
    make_bar_gender,
    make_bar_dev,
    make_histogram_years_code,
    make_histogram_years_code_pro,
    make_histogram_salary,
    make_histogram_skills,
    make_scatter_skill_salary,
    make_scatter_experience_salary,
    make_plot_experience_median_salary,
)

if __name__ == "__main__":
    df = get_kaggle_data()
    clean_df = clean_data(df)

    # One entry per figure: progress message, plotting function, the columns
    # handed to it (in argument order) and the PNG it writes.
    plot_jobs = [
        ("making bar chart of age", make_bar_age,
         ["Age"], "new_bar_age.png"),
        ("making bar chart of gender", make_bar_gender,
         ["Gender"], "new_bar_gender.png"),
        ("making bar chart of developer types", make_bar_dev,
         ["MainBranch"], "new_bar_dev.png"),
        ("making histogram of years coding", make_histogram_years_code,
         ["YearsCode"], "new_hist_years_coding.png"),
        ("making histogram of professional years coding", make_histogram_years_code_pro,
         ["YearsCodePro"], "new_hist_years_coding_pro.png"),
        ("making histogram of previous salary", make_histogram_salary,
         ["PreviousSalary"], "new_hist_salary.png"),
        ("making histogram of computer skills", make_histogram_skills,
         ["ComputerSkills"], "new_hist_skills.png"),
        ("making scatter of skills vs previous salary", make_scatter_skill_salary,
         ["ComputerSkills", "PreviousSalary"], "new_scatter_skill_salary.png"),
        ("making scatter of pro years exp vs previous salary", make_scatter_experience_salary,
         ["YearsCodePro", "PreviousSalary"], "new_scatter_experience_salary.png"),
        ("making plot of pro years exp vs median previous salary", make_plot_experience_median_salary,
         ["YearsCodePro", "PreviousSalary"], "new_plot_experience_median_salary.png"),
    ]

    for message, make_plot, column_names, output_png in plot_jobs:
        print(message)
        plot_args = [clean_df[name] for name in column_names]
        make_plot(*plot_args, output_png)