参考资料：

前言

代码实际跑过一遍，原文中有一些错误，都已经修改过来了。大多数修改的地方都用“修改”标注了。要查看相应内容建议使用搜索“修改”查看。

1 导入工具库
2 加载数据集
3 数据预处理
4 数据分析&可视化
5 分析结论

导入工具库

# For loading data
import pandas as pd
import numpy as np

# For SQL queries
import pandasql as ps

# For plotting graphs / visualization
import plotly.graph_objects as go
import plotly.express as px
from plotly.offline import iplot
import plotly.figure_factory as ff

import plotly.io as pio
import seaborn as sns
import matplotlib.pyplot as plt

# To show graph below the code or on same notebook
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)

# To convert country code to country name
import country_converter as coco

import warnings
warnings.filterwarnings('ignore')

加载数据集

# Load the salary dataset from the local CSV file into a DataFrame.
salaries = pd.read_csv(filepath_or_buffer="./ds_salaries数据集/ds_salaries.csv")
# Preview the first five rows.
salaries.head(5)

# Thin wrapper used by every SQL cell in this notebook.
def query(query):
    """Execute a SQL statement with pandasql and return the result of ``ps.sqldf``.

    Args:
        query: SQL text passed unchanged to ``ps.sqldf``.

    Returns:
        Whatever ``ps.sqldf`` returns for the statement.
    """
    result = ps.sqldf(query)
    return result

# Smoke-test the helper: fetch the first five rows of `salaries` through SQL.
query("""
        select *
        from salaries
        limit 5
""")

数据预处理

# Remove the "Unnamed: 0" column, which the original author marks as unused.
salaries = salaries.drop(columns=["Unnamed: 0"])
salaries

# Count the missing values in every column.
salaries.isnull().sum()

# employee_residence and company_location hold short country codes (per the
# original author's note); convert both columns to full country names so the
# charts are easier to read.
salaries = salaries.assign(
    employee_residence=coco.convert(names=salaries["employee_residence"], to="name"),
    company_location=coco.convert(names=salaries["company_location"], to="name"),
)
salaries

# Expand the experience_level abbreviations to their full names:
#   EN -> Entry Level
#   MI -> Mid level
#   SE -> Senior Level
#   EX -> Expert Level
# A CASE expression maps whole values, so the outcome does not depend on the
# order of nested substring replacements; any other value passes through
# unchanged. Selecting the named column assigns a Series instead of a
# one-column DataFrame.
salaries["experience_level"] = query("""
    SELECT
        CASE experience_level
            WHEN 'EN' THEN 'Entry Level'
            WHEN 'MI' THEN 'Mid level'
            WHEN 'SE' THEN 'Senior Level'
            WHEN 'EX' THEN 'Expert Level'
            ELSE experience_level
        END AS experience_level
    FROM
        salaries
""")["experience_level"]
salaries

# Expand the employment_type abbreviations to their full names:
#   FT -> Full Time
#   PT -> Part Time
#   CT -> Contract
#   FL -> Freelance
# A CASE expression maps whole values, so the outcome does not depend on the
# order of nested substring replacements; any other value passes through
# unchanged. Selecting the named column assigns a Series instead of a
# one-column DataFrame.
salaries['employment_type'] = query("""
    SELECT
        CASE employment_type
            WHEN 'PT' THEN 'Part Time'
            WHEN 'FT' THEN 'Full Time'
            WHEN 'FL' THEN 'Freelance'
            WHEN 'CT' THEN 'Contract'
            ELSE employment_type
        END AS employment_type
    FROM
        salaries
""")["employment_type"]
salaries

# Expand the company_size codes:
#   S -> Small, M -> Medium, L -> Large
# DataFrame.replace with a {column: {old: new}} mapping limits the
# substitution to the company_size column.
replace_rule = dict(S="Small", M="Medium", L="Large")
salaries = salaries.replace({"company_size": replace_rule})

salaries

# Replace the numeric remote_ratio codes with readable labels.
# The result is assigned back to the column rather than calling
# Series.replace(..., inplace=True) on salaries["remote_ratio"]: that chained
# in-place call works on an intermediate object, triggers a pandas warning,
# and does not update `salaries` when Copy-on-Write is active.
replace_rule = {100: 'Fully Remote', 50: 'Partially Remote', 0: 'Non Remote Work'}
salaries["remote_ratio"] = salaries["remote_ratio"].replace(replace_rule)

salaries

数据分析&可视化

# Which ten job titles occur most often in the data?
top10_jobs = query("""\
    select job_title, count(*) as job_count
    from salaries
    group by job_title
    order by job_count desc
    limit 10
""")

# Vertical bar chart: one bar per job title, labelled with its count.
data = go.Bar(
    x=top10_jobs["job_title"],
    y=top10_jobs["job_count"],
    text=top10_jobs["job_count"],
    textposition="outside",                       # labels are drawn outside the bars
    textfont={"size": 12, "color": "black"},      # 12 pt, black label text
    marker={
        "color": px.colors.qualitative.Alphabet,  # bar colours taken from the palette
        "opacity": 0.9,
        "line": {"color": "black", "width": 1},   # bar outline colour and width
    },
)

layout = go.Layout(
    title=dict(text="<b>Top 10 Data Science Jobs</b>", x=0.5),  # bold title at x = 0.5
    xaxis={"title": "<b>Job Title</b>"},
    yaxis={"title": "<b>Total</b>"},
    width=900,
    height=600,
)

# Build the figure, colour the plot area and the surrounding paper, then render.
fig = go.Figure(data=data, layout=layout)
fig.update_layout(plot_bgcolor="#f1e7d2", paper_bgcolor="#f1e7d2")
fig.show()

# The same top-10 ranking shown as a pie chart.
fig = px.pie(
    top10_jobs,
    names="job_title",
    values="job_count",
    color_discrete_sequence=px.colors.qualitative.Alphabet,
)

# Original author's note: plot_bgcolor has no visible effect here because a
# pie chart has no plot-area background; it is kept for parity with the other cells.
fig.update_layout(
    title=dict(text="<b>Distribution of job positions</b>", x=0.5),
    width=900,
    height=600,
    plot_bgcolor="#f1e7d2",
    paper_bgcolor="#f1e7d2",
)
fig.show()

# Ten company locations with the most rows in the dataset.
top10_com_loc = query("""
                    SELECT company_location AS company,
                    Count(*) AS job_count
                    FROM salaries
                    GROUP BY company
                    ORDER BY job_count DESC
                    LIMIT 10
""")

# Vertical bar chart with the count printed outside each bar.
data = go.Bar(
    x=top10_com_loc["company"],
    y=top10_com_loc["job_count"],
    text=top10_com_loc["job_count"],
    textposition="outside",
    textfont={"size": 12, "color": "black"},
    marker={
        "color": px.colors.qualitative.Alphabet,
        "opacity": 0.9,
        "line": {"color": "black", "width": 1},
    },
)

layout = go.Layout(
    title=dict(text="<b>Top 10 Data Science Countries</b>", x=0.5, xanchor="center"),
    xaxis={"title": "<b>Countries</b>", "tickmode": "array"},
    yaxis={"title": "<b>Total</b>"},
    width=900,
    height=600,
)

fig = go.Figure(data=data, layout=layout)
fig.update_layout(plot_bgcolor="#f1e7d2", paper_bgcolor="#f1e7d2")
fig.show()

# Sum salary_in_usd per company location and draw it as a choropleth map.
# as_index=False keeps company_location as a regular column of the result.
temp_df = salaries.groupby('company_location', as_index=False)['salary_in_usd'].sum()
# Colour by the base-10 logarithm of the total.
temp_df['salary_scale'] = np.log10(temp_df['salary_in_usd'])

fig = px.choropleth(
    temp_df,
    locations="company_location",
    locationmode='country names',             # locations are matched by country name
    color="salary_scale",
    hover_name="company_location",
    hover_data=temp_df[['salary_in_usd']],    # extra value shown on hover
    color_continuous_scale='Jet',
)

fig.update_layout(
    title=dict(text='<b>Salaries across the World</b>', xanchor='center', x=0.5),
    plot_bgcolor='#f1e7d2',
    paper_bgcolor='#f1e7d2',
)
fig.show()

# Mean salary_in_usd per salary currency, highest first, limited to 14 rows.
# as_index=False makes the aggregation return a DataFrame with
# salary_currency as a regular column.
df = salaries.groupby('salary_currency', as_index=False)['salary_in_usd'].mean()
df = df.sort_values(by='salary_in_usd', ascending=False).head(14)

# px.bar draws one coloured bar per currency, labelled with the rounded mean.
fig = px.bar(
    df,
    x='salary_currency',
    y='salary_in_usd',
    color='salary_currency',
    text=df['salary_in_usd'].round(),
    color_discrete_sequence=px.colors.qualitative.Safe,
)

fig.update_traces(textposition="outside")  # put the labels outside the bars
fig.update_layout(
    title=dict(text='<b>Average salary as a function of currency</b>', xanchor='center', x=0.5),
    xaxis_title='<b>Currency</b>',
    yaxis_title='<b>Mean Salary</b>',
    plot_bgcolor='#f1e7d2',
    paper_bgcolor='#f1e7d2',
)
fig.show()

# Mean salary_in_usd per company location, highest first, limited to 14 rows.
df = salaries.groupby(['company_location'], as_index=False)[['salary_in_usd']].mean()
df = df.sort_values(by='salary_in_usd', ascending=False).head(14)

fig = px.bar(
    df,
    x='company_location',
    y='salary_in_usd',
    # Label each bar in thousands with two decimals, e.g. "123.45k".
    text=df['salary_in_usd'].apply(lambda usd: str(round(usd / 1000, 2)) + "k"),
    color='company_location',
    color_discrete_sequence=px.colors.qualitative.Dark2,
)

fig.update_traces(textposition="outside")  # put the labels outside the bars
fig.update_layout(
    title=dict(text="<b>Average salary as a function of company location</b>", x=0.5, xanchor='center'),
    xaxis={'title': '<b>Company Location</b>', 'tickmode': 'array'},
    yaxis={'title': '<b>Mean Salary</b>'},
    width=900,
    height=600,
    plot_bgcolor='#f1e7d2',
    paper_bgcolor='#f1e7d2',
)
fig.show()

# Number of rows per experience level, smallest count first.
job_exp = query("""
            SELECT experience_level, Count(*) AS job_count
            FROM salaries
            GROUP BY experience_level
            ORDER BY job_count ASC
""")

# Horizontal bar chart (orientation 'h'): counts on x, levels on y.
data = go.Bar(
    x=job_exp['job_count'],
    y=job_exp['experience_level'],
    orientation='h',
    text=job_exp['job_count'],
    marker={
        'color': px.colors.qualitative.Alphabet,
        'opacity': 0.9,
        'line': {'color': 'white', 'width': 2},
    },
)

layout = go.Layout(
    title=dict(text="<b>Jobs on Experience Levels</b>", x=0.5, xanchor='center'),
    xaxis={'title': '<b>Total</b>'},
    yaxis={'title': '<b>Experience lvl</b>'},
    width=900,
    height=600,
)

fig = go.Figure(data=data, layout=layout)
fig.update_layout(plot_bgcolor='#f1e7d2', paper_bgcolor='#f1e7d2')
fig.show()

# Number of rows per employment type, smallest count first.
job_emp = query("""
SELECT employment_type,
COUNT(*) AS job_count
FROM salaries
GROUP BY employment_type
ORDER BY job_count ASC
""")

# Horizontal bar chart with the count printed outside each bar.
data = go.Bar(
    x=job_emp['job_count'],
    y=job_emp['employment_type'],
    orientation='h',
    text=job_emp['job_count'],
    textposition='outside',
    marker={
        'color': px.colors.qualitative.Alphabet,
        'opacity': 0.9,
        'line': {'color': 'white', 'width': 2},
    },
)

layout = go.Layout(
    title=dict(text="<b>Jobs on Employment Type</b>", x=0.5, xanchor='center'),
    xaxis={'title': '<b>Total</b>', 'tickmode': 'array'},
    yaxis={'title': '<b>Emp Type lvl</b>'},
    width=900,
    height=600,
)

fig = go.Figure(data=data, layout=layout)
fig.update_layout(plot_bgcolor='#f1e7d2', paper_bgcolor='#f1e7d2')
fig.show()

# Number of rows per work year (2020-2022), drawn as a trend line.
# Rows are ordered by work_year so the line connects the points
# chronologically. A single-quoted token in ORDER BY is read by SQLite as a
# string constant rather than a column alias, so it cannot be used to sort.
job_year = query("""
    SELECT work_year, COUNT(*) AS 'job count'
    FROM salaries
    GROUP BY work_year
    ORDER BY work_year ASC
""")


data = go.Scatter(x = job_year['work_year'], y = job_year['job count'],
                  marker = dict(size = 20,            # marker size
                                line_width = 1.5,     # marker outline width
                                line_color = 'white', # marker outline colour
                                color = px.colors.qualitative.Alphabet), # marker colours
                  line = dict(color = '#ED7D31', width = 4),
                  mode = 'lines+markers') # draw both the line and the markers


layout  = go.Layout(title = {
    'text' : "<b><i>Data Science jobs Growth (2020 to 2022)</i></b>",
    'x' : 0.5, 'xanchor' : 'center'},
                    xaxis = dict(title = '<b>Year</b>'),
                    yaxis = dict(title = '<b>Jobs</b>'),
                    width = 900,
                    height = 600)


fig = go.Figure(data = data, layout = layout)
# Integer tick values, matching how the other year-axis chart in this
# notebook sets its ticks.
fig.update_xaxes(tickvals = [2020, 2021, 2022])
fig.update_layout(plot_bgcolor = '#f1e7d2',
                 paper_bgcolor = '#f1e7d2')
fig.show()

# Distribution of salary_in_usd across all rows.
salary_usd = query("""
                    SELECT salary_in_usd 
                    FROM salaries
""")

# Histogram with a kernel density curve.
# The theme is applied BEFORE the figure is created: matplotlib reads
# figure.facecolor when a figure is instantiated, so setting it afterwards
# would leave this figure with the previous background colour.
sns.set(rc = {
    'axes.facecolor' : '#f1e7d2',     # axes background
    'figure.facecolor' : '#f1e7d2'})  # figure background
plt.figure(figsize = (20, 8))

p = sns.histplot(salary_usd["salary_in_usd"],
                kde = True,          # overlay a kernel density estimate
                alpha = 1,           # bar opacity
                fill = True,         # fill the bars
                edgecolor = 'black', # bar outline colour
                linewidth = 1        # bar outline width
                )
p.axes.lines[0].set_color("orange")  # recolour the first line on the axes (the KDE curve)
plt.title("Data Science Salary Distribution \n", fontsize = 25)
plt.xlabel("Salary", fontsize = 18)
plt.ylabel("Count", fontsize = 18)
plt.show()

# The ten highest-paid rows, ranked by salary in USD.
# The plotted value is salary_in_usd (aliased to `salary` so the column name
# used below stays the same): the ranking is by USD, and the raw `salary`
# column is expressed in different currencies, so it is not comparable
# across rows.
salary_hi10 = query("""
    SELECT job_title,
    salary_in_usd AS salary
    FROM salaries
    ORDER BY salary_in_usd DESC
    LIMIT 10
""")

data = go.Bar(x = salary_hi10['salary'],
             y = salary_hi10['job_title'],
             orientation = 'h',
             text = salary_hi10['salary'],
             textposition = 'inside',
             insidetextanchor = 'middle', # anchor the label in the middle of the bar
             textfont = dict(size = 13, color = 'black'),
             marker = dict(color = px.colors.qualitative.Alphabet,
                           opacity = 0.9,
                           line_color = 'black',
                           line_width = 1))

layout = go.Layout(title = {
    'text': "<b>Top 10 Highest paid Data Science Jobs</b>",
                           'x':0.5,
                           'xanchor': 'center'},
                   xaxis = dict(title = '<b>salary</b>', tickmode = 'array'),
                   yaxis = dict(title = '<b>Job Title</b>'),
                   width = 900,
                   height = 600)
fig = go.Figure(data = data, layout = layout)
fig.update_layout(plot_bgcolor = '#f1e7d2',
                 paper_bgcolor = '#f1e7d2')
fig.show()

# Ten job titles with the highest rounded mean salary_in_usd.
salary_av10 = query("""
    SELECT job_title,
    ROUND(AVG(salary_in_usd)) AS salary
    FROM salaries
    GROUP BY job_title
    ORDER BY salary DESC
    LIMIT 10
""")

# Horizontal bars with the mean printed in the middle of each bar.
data = go.Bar(
    x=salary_av10['salary'],
    y=salary_av10['job_title'],
    orientation='h',
    text=salary_av10['salary'],
    textposition='inside',
    insidetextanchor='middle',
    textfont={'size': 13, 'color': 'black'},
    marker={
        'color': px.colors.qualitative.Alphabet,
        'opacity': 0.9,
        'line': {'color': 'white', 'width': 2},
    },
)

layout = go.Layout(
    title=dict(text="<b>Top 10 Average paid Data Science Jobs</b>", x=0.5, xanchor='center'),
    xaxis={'title': '<b>salary</b>', 'tickmode': 'array'},
    yaxis={'title': '<b>Job Title</b>'},
    width=900,
    height=600,
)

fig = go.Figure(data=data, layout=layout)
fig.update_layout(plot_bgcolor='#f1e7d2', paper_bgcolor='#f1e7d2')
fig.show()

# Rounded mean salary_in_usd per work year, drawn as a trend line.
# Rows are ordered by work_year: with mode 'lines+markers' the points are
# connected in row order, so sorting by salary could make the line double
# back whenever the means are not monotonic in time.
salary_year = query("""
    SELECT ROUND(AVG(salary_in_usd)) AS salary,
    work_year AS year
    FROM salaries
    GROUP BY work_year
    ORDER BY work_year ASC
""")

data = go.Scatter(x = salary_year['year'],
                  y = salary_year['salary'],
                  marker = dict(size = 20,
                  line_width = 1.5,
                  line_color = 'black',
                  color = '#ED7D31'),
                  line = dict(color = 'black', width = 4), mode = 'lines+markers')

layout = go.Layout(title = {
    'text' : "<b>Data Science Salary Growth (2020 to 2022) </b>",
                            'x' : 0.5,
                            'xanchor' : 'center'},
                   xaxis = dict(title = '<b>Year</b>'),
                   yaxis = dict(title = '<b>Salary</b>'),
                   width = 900,
                   height = 600)


fig = go.Figure(data = data, layout = layout)
# Integer tick values, matching how the other year-axis chart in this
# notebook sets its ticks.
fig.update_xaxes(tickvals = [2020, 2021, 2022])
fig.update_layout(plot_bgcolor = '#f1e7d2',
                 paper_bgcolor = '#f1e7d2')
fig.show()

# Salary versus experience level.
salary_exp = query("""
    SELECT experience_level AS 'Experience Level',
    salary_in_usd AS Salary
    FROM salaries
""")

# Violin plot with an embedded box plot, one violin per experience level.
fig = px.violin(
    salary_exp,
    x='Experience Level',
    y='Salary',
    color='Experience Level',
    box=True,
)

# NOTE(review): plotly documents ticktext as taking effect only together with
# tickmode='array' and tickvals; neither is set here, so this list is
# presumably ignored - confirm whether tickvals was intended.
fig.update_layout(
    title=dict(text="<b>Salary on Experience Level<br>经验水平&薪资</b>", xanchor='center', x=0.5),
    xaxis={'title': '<b>Experience level</b>'},
    yaxis={
        'title': '<b>salary</b>',
        'ticktext': [-300000, 0, 100000, 200000, 300000, 400000, 500000, 600000, 700000],
    },
    width=900,
    height=600,
    paper_bgcolor='#f1e7d2',
    plot_bgcolor='#f1e7d2',
    showlegend=False,
)
fig.show()

# Median salary per (work_year, experience_level).
# numeric_only=True restricts the median to numeric columns; without it,
# pandas >= 2.0 raises a TypeError because the frame also holds string columns.
tmp_df = salaries.groupby(['work_year', 'experience_level']).median(numeric_only=True)
tmp_df.reset_index(inplace = True)  # turn the group keys back into regular columns
display(tmp_df.head())              # show the first five rows

# One line per experience level, each with its own marker symbol.
fig = px.line(tmp_df, x='work_year', y='salary_in_usd', color='experience_level', symbol="experience_level")

fig.update_layout(title = {
    'text': "<b>Median Salary Trend By Experience Level<br>不同经验水平的薪资趋势</b>",
                            'x':0.5, 'xanchor': 'center'},
                  xaxis = dict(title = '<b>Working Year</b>', tickvals = [2020, 2021, 2022], tickmode = 'array'),
                  yaxis = dict(title = '<b>Salary</b>'),
                  width = 900,
                  height = 600)

fig.update_layout(plot_bgcolor = '#f1e7d2',
                 paper_bgcolor = '#f1e7d2')
fig.show()

# Salary distribution for each work year.
year_gp = salaries.groupby('work_year')
# One salary_in_usd Series per year, in the same order as group_labels.
hist_data = [year_gp.get_group(year)['salary_in_usd'] for year in (2020, 2021, 2022)]
group_labels = ['2020', '2021', '2022']

# show_hist=False hides the histogram bars in the distplot.
fig = ff.create_distplot(hist_data, group_labels, show_hist=False)

fig.update_layout(
    title=dict(text="<b>Salary Distribution By Working Year<br>年份&薪资分布</b>", x=0.5, xanchor='center'),
    xaxis={'title': '<b>Salary</b>'},
    yaxis={'title': '<b>Kernel Density</b>'},
    width=900,
    height=600,
    plot_bgcolor='#f1e7d2',
    paper_bgcolor='#f1e7d2',
)
fig.show()

# Salary versus employment type.
salary_emp = query("""
    SELECT employment_type AS 'Employment Type',
    salary_in_usd AS Salary
    FROM salaries
""")

# Box plot, one coloured box per employment type.
fig = px.box(
    salary_emp,
    x='Employment Type',
    y='Salary',
    color='Employment Type',
)

fig.update_layout(
    title=dict(text="<b>Salary by Employment Type</b>", x=0.5, xanchor='center'),
    xaxis={'title': '<b>Employment Type</b>'},
    yaxis={'title': '<b>Salary</b>'},
    width=900,
    height=600,
    plot_bgcolor='#f1e7d2',
    paper_bgcolor='#f1e7d2',
)
fig.show()

# Number of rows per company size.
comp_size = query("""
                SELECT company_size,
                COUNT(*) AS count
                FROM salaries
                GROUP BY company_size
""")

# Donut chart: a pie trace with a hole in the middle.
data = go.Pie(
    labels=comp_size['company_size'],
    values=comp_size['count'].to_numpy(),
    hoverinfo='label',      # what is shown when hovering over a slice
    hole=0.5,
    textfont_size=16,
    textposition='auto',
)
fig = go.Figure(data=data)

fig.update_layout(
    title=dict(text="<b>Company Size</b>", x=0.5, xanchor='center'),
    xaxis={'title': '<b></b>'},
    yaxis={'title': '<b></b>'},
    width=900,
    height=600,
    plot_bgcolor='#f1e7d2',
    paper_bgcolor='#f1e7d2',
)
fig.show()

# Share of each experience level within every company size, on a polar chart.

# Angular categories, in the order the values are plotted.
categories = ['Entry Level', 'Expert Level','Mid level','Senior Level']

# size() counts rows per (company_size, experience_level) pair and returns a Series.
df = salaries.groupby(['company_size', 'experience_level']).size()

# Each company-size slice is reindexed to `categories` so that every value
# lines up with its theta label even if a size has no rows for some level
# (missing levels become 0). Dividing by the slice total gives proportions.
comp_s = np.round((df['Small'].reindex(categories, fill_value=0) / df['Small'].sum()).values, 2)
comp_m = np.round((df['Medium'].reindex(categories, fill_value=0) / df['Medium'].sum()).values, 2)
comp_l = np.round((df['Large'].reindex(categories, fill_value=0) / df['Large'].sum()).values, 2)

fig = go.Figure()

# One closed polar trace per company size.
fig.add_trace(go.Scatterpolar(
    r = comp_s,
    theta = categories, # angular coordinate
    fill = 'toself',    # fill the area enclosed by the trace
    name = 'Company Size S'))

fig.add_trace(go.Scatterpolar(
    r = comp_m,
    theta = categories,
    fill = 'toself',
    name = 'Company Size M'))

fig.add_trace(go.Scatterpolar(
    r = comp_l,
    theta = categories,
    fill = 'toself',
    name = 'Company Size L'))

fig.update_layout(
    polar = dict(
    radialaxis = dict(range = [0, 0.6])), # radial axis spans [0, 0.6]
    showlegend = True,
)


fig.update_layout(title = {
    'text': "<b>Proportion of Experience Level In Different Company Sizes</b>",
                            'x':0.5, 'xanchor': 'center'},
                  xaxis = dict(title = '<b></b>'),
                  yaxis = dict(title = '<b></b>'),
                  width = 900,
                  height = 600)

fig.update_layout(plot_bgcolor = '#f1e7d2',
                 paper_bgcolor = '#f1e7d2')
fig.show()

# Salary versus company size.
salary_size = query("""
    SELECT company_size AS 'Company size',
    salary_in_usd AS Salary
    FROM salaries
""")

# Box plot, one coloured box per company size.
fig = px.box(
    salary_size,
    x='Company size',
    y='Salary',
    color='Company size',
)

fig.update_layout(
    title=dict(text="<b>Salary by Company size</b>", x=0.5, xanchor='center'),
    xaxis={'title': '<b>Company size</b>'},
    yaxis={'title': '<b>Salary</b>'},
    width=900,
    height=600,
    plot_bgcolor='#f1e7d2',
    paper_bgcolor='#f1e7d2',
)
fig.show()

# Number of rows per remote_ratio value (remote versus on-site work).
rem_type = query("""
    SELECT remote_ratio,
    COUNT(*) AS total
    FROM salaries
    GROUP BY remote_ratio
""")

# Donut chart of the remote_ratio counts.
data = go.Pie(
    labels=rem_type['remote_ratio'],
    values=rem_type['total'].to_numpy(),
    hoverinfo='label',
    hole=0.4,
    textfont_size=18,
    textposition='auto',
)
fig = go.Figure(data=data)

fig.update_layout(
    title=dict(text="<b>Remote Ratio</b>", x=0.5, xanchor='center'),
    width=900,
    height=600,
    plot_bgcolor='#f1e7d2',
    paper_bgcolor='#f1e7d2',
)
fig.show()

# Salary versus remote type.
salary_remote = query("""
    SELECT remote_ratio AS 'Remote type',
    salary_in_usd AS Salary
    From salaries
""")

# Box plot, one coloured box per remote type.
fig = px.box(
    salary_remote,
    x='Remote type',
    y='Salary',
    color='Remote type',
)

fig.update_layout(
    title=dict(text="<b>Salary by Remote Type</b>", x=0.5, xanchor='center'),
    xaxis={'title': '<b>Remote type</b>'},
    yaxis={'title': '<b>Salary</b>'},
    width=900,
    height=600,
    plot_bgcolor='#f1e7d2',
    paper_bgcolor='#f1e7d2',
)
fig.show()

# Row counts per (experience_level, remote_ratio) pair.
exp_remote = salaries.groupby(['experience_level', 'remote_ratio']).size()  # a Series
display(exp_remote.head())
# Turn the group keys into columns and name the count column 'cnt'.
exp_remote = exp_remote.reset_index(name='cnt')
display(exp_remote.head())
print(exp_remote.index)

# Grouped (side-by-side) bars per experience level, one colour per remote ratio.
fig = px.histogram(
    exp_remote,
    x='experience_level',
    y='cnt',
    color='remote_ratio',
    barmode='group',
    text_auto=True,
)

fig.update_layout(
    title=dict(
        text="<b>Respondent Count In Different Experience Level Based on Remote Ratio</b>",
        x=0.5,
        xanchor='center',
    ),
    xaxis={'title': '<b>Experience Level</b>'},
    yaxis={'title': '<b>Number of Respondents</b>'},
    width=900,
    height=600,
    plot_bgcolor='#f1e7d2',
    paper_bgcolor='#f1e7d2',
)
fig.show()

分析结论

数据科学领域Top3多的职位是数据科学家、数据工程师和数据分析师。

数据科学工作越来越受欢迎。员工比例从2020年的11.9%增加到2022年的52.4%。

美国是数据科学公司最多的国家。

工资分布的IQR在62.7k和150k之间。

在数据科学员工中，大多数是高级水平，而专家级则更少。

大多数数据科学员工都是全职工作，很少有合同工和自由职业者。

首席数据工程师是薪酬最高的数据科学工作。

数据科学的最低工资（入门级经验）为4000美元，具有专家级经验的数据科学的最高工资为60万美元。

公司构成：53.7%中型公司，32.6%大型公司，13.7%小型数据科学公司。

工资也受公司规模影响，规模大的公司支付更高的薪水。

62.8%的数据科学岗位是完全远程工作，20.9%是非远程工作，16.3%是部分远程工作。

数据科学薪水随时间和经验积累而增长。

[数据分析实战][37] 基于pandasql和plotly的数据科学家薪资分析与可视化 @ShowMeAI