【实操】SQL数据清洗&TABLEAU数据可视化操作

背景

想要对数据分析岗位的市场需求、就业情况、岗位技能做一个简单调查。利用爬虫采集完毕数据分析的相关信息。只考虑北上广深的一线是城市。

报告先行~

数据分析岗位分析报告

目标确认与目标拆解

图1

数据清洗

  1. 缺失数据处理&限定岗位–>data_clean_null_jobname
    某字段为null或空字符的删去;限定岗位名称带有“数据”二字
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
create view data_clean_null_jobname as (
select *
from data
where
job_href is not null and job_href != '' and
job_name is not null and job_name != '' and
company_href is not null and company_href != '' and
company_name is not null and company_name != '' and
providesalary_text is not null and providesalary_text != '' and
workarea is not null and workarea != '' and
workarea_text is not null and workarea_text != '' and
companytype_text is not null and companytype_text != '' and
degreefrom is not null and degreefrom != '' and
workyear is not null and workyear != '' and
updatedate is not null and updatedate != '' and
issuedate is not null and issuedate != '' and
parse2_job_detail is not null and parse2_job_detail != ''
and job_name like '%数据%'
)
  1. 处理重复发布的数据–>data_clean_distinc
    利用对发布时间的排序可以选择最近发布的重复数据
1
2
3
4
5
6
7
8
9
create view data_clean_distinct as (
select *
from (
select *,
row_number() over(partition by job_name,company_name order by issuedate desc) last_issue
from data_clean_null_jobname
)t
where last_issue=1
)
  1. 限定城市为北上广深四个–>data_clean_don
    建立新的城市字段,方便后续利用字段进行分析
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
create view data_clean_done as (
select *
from (select id,job_href,job_name,company_href,
company_name,providesalary_text,workarea,
workarea_text,updatedate,companytype_text,
degreefrom,workyear,issuedate,parse2_job_detail,
case when workarea_text like'%北京%' then '北京'
when workarea_text like'%上海%' then '上海'
when workarea_text like'%广州%' then '广州'
when workarea_text like'%深圳%' then '深圳' end
as base
from data_clean_distinct
)k
where base is not null
)

数据分析

市场需求分析

  1. 各城市招聘数目对比&占比情况
1
2
3
4
5
6
7
8
9
10
11
12
create view data_base_need as 
select
base as '城市',base_cnt as '招聘量',
concat(round((base_cnt/
(select sum(degreefrom) over() from data_clean_done limit 1)
)*100,2),'%') as '招聘占比'
from (select
base,
sum(degreefrom) as base_cnt
from data_clean_done
group by base)k1
order by base_cnt desc

结果展示图2

  1. 各企业类型招聘数目对比&占比情况
1
2
3
4
5
6
7
8
9
10
11
12
create view data_companytype_need as 
select
companytype_text as '企业类型',company_cnt as '招聘量',
concat(round((company_cnt/
(select sum(degreefrom) over() from data_clean_done limit 1)
)*100,2),'%') as '招聘占比'
from (select
companytype_text,
sum(degreefrom) as company_cnt
from data_clean_done
group by companytype_text)k1
order by company_cnt desc

结果展示图3

  1. 各工作年限招聘数目对比&占比情况
1
2
3
4
5
6
7
8
9
10
11
12
create view data_companytype_need as 
select
workyear as '工作年限',cnt as '招聘量',
concat(round((cnt/
(select sum(degreefrom) over() from data_clean_done limit 1)
)*100,2),'%') as '招聘占比'
from (select
workyear,
sum(degreefrom) as cnt
from data_clean_done
group by workyear)k1
order by cnt desc

结果展示图4

可视化展示:仪表板-需求

薪资情况分析

由于薪资字段还没清洗好,所以规范薪资字段,得到最大值、最小值和平均值

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
create view data_salary as 
with temp_data_salary as (
select *,
substring_index(providesalary_text,'-',1)*unit as min_salary,
substring_index(providesalary_text,'-',-1)*unit as max_salary
from (select *,
case
when providesalary_text like '%千/月%' then 1000
when providesalary_text like '%万/月%' then 10000
when providesalary_text like '%万/年%' then 833 end as unit
from data_clean_done)t1
)
select *,(min_salary+max_salary)/2 avg_salary
from temp_data_salary
where min_salary is not null
  1. 各城市平均薪酬情况
1
2
3
4
5
create view data_salary_city as 
select base as '城市',round(avg(avg_salary),0) as '平均薪资'
from data_salary
group by workyear
order by round(avg(avg_salary),0)

结果展示图5

  1. 各企业类型平均薪酬情况
1
2
3
4
5
create view data_salary_company as 
select companytype_text as '企业类型',round(avg(avg_salary),0) as '平均薪资'
from data_salary
group by companytype_text
order by round(avg(avg_salary),0) desc

结果展示图6

  1. 各工作年限平均薪酬情况
1
2
3
4
5
create view data_salary_workyear as 
select workyear as '工作年限',round(avg(avg_salary),0) as '平均薪资'
from data_salary
group by workyear
order by length(workyear),workyear

结果展示图7

可视化展示:仪表板-薪资

核心技能要求分析

  1. 各技能出现频率统计
1
2
3
4
5
6
7
8
9
10
11
create view skill_quantity	as 
select skill as '需求',cnt as '需求量',
concat(round(cnt/(select count(1) from data_clean_done)*100,2),'%') as '占比'
from(
select skill,count(1) cnt
from skill_table
join data_clean_done
on data_clean_done.parse2_job_detail like concat('%',skill_table.skill,'%')
group by skill
order by count(1) desc
)t

结果展示图8

  1. 各企业类型的核心技能要求情况
    由于技能种类太多,所以需要筛选出每种企业类型要求频率前十的技能,并在每个分组里按照技能频率降序排序
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
create view data_skill_companytype as 
select companytype_text'企业类型',skill'技能',cnt'需求量',
concat(round((cnt/cnt_all)*100,2),'%') as '占比'
from (
select *,rank() over(partition by companytype_text order by cnt desc) r
from (
select companytype_text,skill,count(1) cnt
from skill_table
join data_clean_done
on data_clean_done.parse2_job_detail like concat('%',skill_table.skill,'%')
group by companytype_text,skill
)t1
left join
(select companytype_text,count(1) cnt_all
from skill_table
join data_clean_done
on data_clean_done.parse2_job_detail like concat('%',skill_table.skill,'%')
group by companytype_text
)t2
using(companytype_text)
)t3
where r<11
order by companytype_text,cnt desc

结果展示图9

图9

  1. 各工作年限的核心技能要求情况
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
create view data_skill_workyear as 
select workyear'工作年限',skill'技能',cnt'需求量',
concat(round((cnt/cnt_all)*100,2),'%') as '占比'
from (
select *,rank() over(partition by workyear order by cnt desc) r
from (
select workyear,skill,count(1) cnt
from skill_table
join data_clean_done
on data_clean_done.parse2_job_detail like concat('%',skill_table.skill,'%')
group by workyear,skill
)t1
left join
(select workyear,count(1) cnt_all
from skill_table
join data_clean_done
on data_clean_done.parse2_job_detail like concat('%',skill_table.skill,'%')
group by workyear
)t2
using(workyear)
)t3
where r<11
order by length(workyear),workyear,cnt desc

结果展示图10

图10

可视化展示:技能-频次-气泡图

仪表板-技能-条形图

结论

  • 需求方面

    • 数据分析需求最旺盛的城市是上海

    • 数据分析需求最多的企业类型是民营企业

    • 数据分析最需要的工作经验是3-5年

  • 薪资方面

    • 数据分析师的薪资很有竞争力

    • 北京的平均工资最高

    • 1-3年的平均薪资为9k,10年以上的薪资不具备参考

    • 不同企业的平均薪资相差不大

  • 技能方面

    • 核心技能为SQL、大数据、Excel、Python、报告、数据挖掘等

【实操】SQL数据清洗&TABLEAU数据可视化操作
https://liaoweiquan.github.io/2020/11/06/数据分析岗位分析/
作者
泉泉
发布于
2020年11月6日
许可协议