Overview
Word cloud analysis:
## package: readxl     used to read Excel files
## package: ggplot2    used to draw the charts
## package: jiebaR     used for Chinese word segmentation
## package: wordcloud2 used to draw word clouds
library(readxl)
library(ggplot2)
library(jiebaR)
library(wordcloud2)
options(scipen = 200)
## avoid scientific notation in printed output
jobinfo = read_excel("jobinfo.xlsx")
## read the raw data
str(jobinfo)
## inspect the data structure
jobinfo$最低薪资 = as.numeric(jobinfo$最低薪资)
## convert the minimum-salary column from character to numeric
jobinfo$最高薪资 = as.numeric(jobinfo$最高薪资)
## convert the maximum-salary column from character to numeric
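as.numeric() silently turns any string it cannot parse into NA, so a quick check after the coercion is worthwhile. A minimal sketch using the same columns:
sum(is.na(jobinfo$最低薪资)) ## rows where the minimum salary could not be parsed
sum(is.na(jobinfo$最高薪资)) ## rows where the maximum salary could not be parsed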
## Colour settings for later plots, generated with rgb()
col1 = rgb(32,40,51,maxColorValue = 255)   ## a shade of black
col2 = rgb(172,22,34,maxColorValue = 255)  ## a shade of dark red
col3 = "indianred"                         ## a shade of light red
col4 = "dimgrey"                           ## a shade of grey
############ Industry category analysis
## extract the industry category column from jobinfo
hangye = jobinfo$行业类别
## count the frequency of each industry; the result is returned as a data.frame
hangye = as.data.frame(ftable(hangye))
## inspect the industry categories
View(hangye)
###############
## The top five categories account for the largest share, so their counts are plotted
## below (a bar chart; a pie-chart sketch follows it). The remaining categories are too
## numerous to label individually, so they are shown as a word cloud instead.
## extract the names and counts of the five most frequent industries
hangye = hangye[order(-hangye$Freq),]
hangye$hangye = as.character(hangye$hangye)
hangye$hangye[4] = "快速消费品"
## shorten "快速消费品(食品、饮料、化妆品)" to "快速消费品"
top5 = hangye[order(-hangye$Freq),][1:5,]
## assemble the names and counts of the top five industries and plot them
行业 = factor(top5$hangye,levels = top5$hangye)
频数 = top5$Freq
industry.df = data.frame(x = 行业, y = 频数)
ggplot(industry.df, aes(x=行业, y=频数)) + geom_bar(stat = 'identity', fill=col4)
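The original note mentions a pie chart for the top-five proportions; the code above draws a bar chart instead. If a pie chart is wanted, one possible sketch using the industry.df just built:
## Sketch: pie-chart variant of the same top-five counts
ggplot(industry.df, aes(x = "", y = y, fill = x)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar(theta = "y") +
  labs(x = NULL, y = NULL, fill = "行业")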
############## Word cloud for the remaining ("其它") industries: first zero out the counts of the top five industries and of "无", then draw the word cloud
hangye$Freq[which(hangye$hangye=="无")] = 0
hangye$Freq[which(hangye$hangye==top5$hangye[1])] = 0
hangye$Freq[which(hangye$hangye==top5$hangye[2])] = 0
hangye$Freq[which(hangye$hangye==top5$hangye[3])] = 0
hangye$Freq[which(hangye$hangye==top5$hangye[4])] = 0
hangye$Freq[which(hangye$hangye==top5$hangye[5])] = 0
wordcloud2(hangye, size = 0.2, shape = "diamond") # word cloud
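Setting the counts to zero keeps those rows in the data frame; an equivalent approach, sketched here with the same variable names, is to drop them before plotting:
## Sketch: drop the top-five industries and "无" instead of zeroing their counts
other = hangye[!(hangye$hangye %in% c(top5$hangye, "无")) & hangye$Freq > 0, ]
wordcloud2(other, size = 0.2, shape = "diamond")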
############## Job description processing
mixseg = worker() ## set up the segmentation engine with its default settings
subdata1 = as.character(jobinfo$描述[1])
# read the first job description and use it as a segmentation test case
fenci = mixseg[subdata1] # segment the text
fenci # show the segmentation result for the first description; the loop below segments every remaining description in the data set
for (i in 2:length(jobinfo$描述)){
  subdata = as.character(jobinfo$描述[i])
  subfenci = mixseg[subdata]
  fenci = c(fenci,subfenci)
}
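Growing fenci with c() copies the whole vector on every iteration. A vectorized sketch of the same step, assuming the same mixseg worker and 描述 column:
## Sketch: segment every description at once and flatten the result
fenci = unlist(lapply(as.character(jobinfo$描述), function(x) mixseg[x]))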
################### Unify letter case and merge synonyms (a lookup-table sketch follows this block)
fenci[which(fenci=="excel")] = "Excel"
fenci[which(fenci=="EXCEL")] = "Excel"
fenci[which(fenci=="r")] = "R"
fenci[which(fenci=="spss")] = "SPSS"
fenci[which(fenci=="Spss")] = "SPSS"
fenci[which(fenci=="python")] = "Python"
fenci[which(fenci=="Matlab")] = "MATLAB"
fenci[which(fenci=="matlab")] = "MATLAB"
fenci[which(fenci=="java")] = "Java"
fenci[which(fenci=="Sql")] = "SQL"
fenci[which(fenci=="sql")] = "SQL"
fenci[which(fenci=="sas")] = "SAS"
fenci[which(fenci=="WORD")] = "Word"
fenci[which(fenci=="word")] = "Word"
fenci[which(fenci=="ppt")] = "PPT"
fenci[which(fenci=="Ppt")] = "PPT"
fenci[which(fenci=="Office")] = "office"
fenci[which(fenci=="spark")] = "Spark"
fenci[which(fenci=="SPARK")] = "Spark"
fenci[which(fenci=="STATA")] = "Stata"
fenci[which(fenci=="stata")] = "Stata"
fenci[which(fenci=="HADOOP")] = "hadoop"
fenci[which(fenci=="Hadoop")] = "hadoop"
fenci[which(fenci=="Eviews")] = "EViews"
fenci[which(fenci=="EVIEWS")] = "EViews"
fenci[which(fenci=="eviews")] = "EViews"
fenci[which(fenci=="实时")] = "按时"
fenci[which(fenci=="及时")] = "按时"
fenci[which(fenci=="大学本科")] = "本科"
fenci[which(fenci=="挖掘")] = "数据挖掘"
fenci[which(fenci=="思维")] = "逻辑思维"
fenci[which(fenci=="逻辑")] = "逻辑思维"
fenci[which(fenci=="协作")] = "合作"
fenci[which(fenci=="合作伙伴")] = "合作"
fenci[which(fenci=="敬业")] = "敬业精神"
fenci[which(fenci=="执行")] = "执行力"
fenci[which(fenci=="编写")] = "撰写"
fenci[which(fenci=="专业本科")] = "专业"
fenci[which(fenci=="快")] = "快速"
########### The following removes meaningless stop words. Because the stop-word list is long, it is split across two txt files to keep the run time manageable (a vectorized sketch follows the second loop).
thetable = table(fenci)
# count the frequency of each word
dftable = as.data.frame(thetable) # convert to data.frame format
dftable = dftable[which(dftable$Freq>500),] # keep only words with frequency above 500
stopwords = unlist(read.table("stopwords.txt",stringsAsFactors=F,fileEncoding = "GB2312")) # read the first stop-word list
dftable1 = dftable
# remove the stop words listed in the file just read
for (i in length(dftable$fenci):1) {
  for (j in 1:length(stopwords)) {
    if (dftable$fenci[i]==stopwords[j]){
      dftable1 = dftable1[-i,]
    }
  }
}
########## Read the second stop-word list and remove its words as well
newstopwords = unlist(read.table("newstopwords.txt",stringsAsFactors=F,fileEncoding = "GB2312"))
dftable2 = dftable1
for (i in length(dftable1$fenci):1) {
  for (j in 1:length(newstopwords)) {
    if (dftable1$fenci[i]==newstopwords[j]){
      dftable2 = dftable2[-i,]
    }
  }
}
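Both loops delete matching rows one at a time. A sketch of the same filtering done in a single pass with %in%, assuming stopwords and newstopwords were read as above:
## Sketch: equivalent stop-word filtering in one step
allstop = c(stopwords, newstopwords)
dftable2 = dftable[!(dftable$fenci %in% allstop), ]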
########### Plot the high-frequency words
ordered = dftable2[order(-dftable2$Freq),] # sort the words by frequency, descending
order11 = ordered[1:11,] # take the top 11
View(order11)
## "能力" (ability) subsumes the other words below, all of which are specific abilities, so it is dropped
order10 = order11[-1,]
###############
## plot the high-frequency words in order10 together with their counts
分词 = factor(order10$fenci,levels = order10$fenci)
频数 = order10$Freq
df = data.frame(x = 分词, y = 频数)
ggplot(df,aes(x=分词,y=频数)) + geom_bar(stat = 'identity',fill=col4)+
labs(x="描述关键词")
#################### Word cloud after removing the high-frequency words
redftable2 = dftable2
for (i in 1:length(order11$Freq)){
  # set the counts of the 11 most frequent words to zero
  redftable2$Freq[which(dftable2$fenci==order11$fenci[i])]=0
}
## draw the word cloud
wordcloud2(redftable2)
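Because the counts above were zeroed rather than removed, rows with a Freq of 0 are still passed to wordcloud2. A small sketch that drops them first:
## Sketch: keep only words that still have a positive count
wordcloud2(redftable2[redftable2$Freq > 0, ])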