首页
学习
活动
专区
工具
TVP
发布
精选内容/技术社群/优惠产品,尽在小程序
立即前往

Python基于NLTK+jieba+SnowNLP的情感分析(二)

#-*- coding: UTF-8 -*-

__author__ = 'Administrator'

import pymysql

import math

import pandas

import jieba

import jieba.analyse

from snownlp import SnowNLP

import nltk

from nltk.classify import NaiveBayesClassifier

from nltk.tokenize import WordPunctTokenizer

from nltk.corpus import stopwords

class dboUtils(object):
    """Thin wrapper around a single pymysql connection/cursor pair."""

    def __init__(self):
        # NOTE(review): credentials are hard-coded — move to config/env.
        self.conn = pymysql.connect(host="192.168.1.1", port=3306, user="root",
                                    password="123", db="stet", charset="utf8")
        # BUG fix: the original never created a cursor, yet query()
        # returned self.cursor — an AttributeError at first use.
        self.cursor = self.conn.cursor()

    def query(self, sql=''):
        """Execute *sql* and return the cursor (caller fetches rows).

        BUG fix: the original ignored *sql* entirely and just returned
        the (nonexistent) cursor.
        """
        self.cursor.execute(sql)
        return self.cursor

def main1():
    """Score every topic's content with SnowNLP sentiment.

    Reads rows via the module-level ``dbo`` helper and returns a list of
    dicts ``{'id', 'title', 'score'}`` (score is SnowNLP's 0..1 sentiment).
    """
    res = dbo.query("SELECT id,title,content FROM topic")
    rows = res.fetchall()
    data = []
    for row in rows:
        topic_id = row[0]
        text = row[2]
        if text != '':
            sentiment = SnowNLP(text).sentiments
            # Renamed from `dict` — shadowing the builtin is a bug magnet.
            item = {}
            item['id'] = topic_id
            item['title'] = text
            item['score'] = sentiment
            print(text + " : " + str(sentiment))
            # BUG fix: the append was commented out, so main1 always
            # returned an empty list despite building each item.
            data.append(item)
    return data

def _collectIds(text, word_lst, vocab):
    """Collect vocab ids: exact-token matches, else substring scan of *text*.

    For each token in *word_lst* that is a key of *vocab*, its id is taken;
    for each non-matching token, every vocab key found anywhere in *text*
    contributes its id. Duplicates are removed by the caller's set().
    """
    ids = []
    for word in word_lst:
        idx = vocab.get(word)
        if idx is not None:
            ids.append(str(idx))
        else:
            for key in vocab:
                if text.find(key) >= 0:
                    ids.append(str(vocab.get(key)))
    return ids


def getKeywordIds(text, verb1, verb2, word_lst=None):
    """Return ``[keyword-id CSV from verb1, problem-id CSV from verb2]``.

    *verb1*/*verb2* map keyword string -> id.

    BUG fix: the original read ``word_lst`` from a variable it never
    defined (it leaked in as a global from main2's loop in the flat
    script). It is now an explicit, backward-compatible parameter; when
    omitted, *text* is tokenized here instead.
    """
    if word_lst is None:
        word_lst = WordPunctTokenizer().tokenize(text)
    keywordids = _collectIds(text, word_lst, verb1)
    problem = _collectIds(text, word_lst, verb2)
    return [",".join(list(set(keywordids))), ",".join(list(set(problem)))]

def main2():
    """Train an NLTK Naive Bayes classifier from DB keywords, then score comments.

    Keywords are bucketed by their ``type`` column, used as training
    features, and every comment is scored with SnowNLP sentiment plus
    keyword matching. Returns a list of dicts
    ``{'id', 'score', 'keywordids', 'problem'}``.
    """
    keyword = dbo.query("SELECT id,name,type FROM keyword WHERE appid = 1")
    keyword = keyword.fetchall()

    # Data preparation: bucket keyword rows (id, name, type) by type.
    positive_vocab = {}
    negative_vocab = {}
    neutral_vocab = {}
    problem_vocab = {}  # fixed typo: was `problom_vocab`
    for row in keyword:
        if row[2] == 1:
            positive_vocab[row[1]] = row[0]
        elif row[2] == 2:
            problem_vocab[row[1]] = row[0]
        elif row[2] == 3:
            # BUG fix: the original line was `elif idx[2]` — a syntax error
            # (no comparison, no colon). Assuming type 3 marks negative
            # keywords, mirroring types 1/2 above — TODO confirm.
            negative_vocab[row[1]] = row[0]
        else:
            neutral_vocab[row[1]] = row[0]

    # Feature extraction: one bag-of-chars feature dict per keyword.
    positive_features = [(word_feats(pos), 'pos') for pos in positive_vocab]
    negative_features = [(word_feats(neg), 'neg') for neg in negative_vocab]
    neutral_features = [(word_feats(neu), 'neu') for neu in neutral_vocab]
    train_set = negative_features + positive_features + neutral_features

    # Training
    classifier = NaiveBayesClassifier.train(train_set)

    # Scoring every comment.
    res = dbo.query("SELECT id,content FROM comments")
    rows = res.fetchall()
    data = []
    for row in rows:
        comment_id = row[0]
        text = row[1]
        if text == '':
            continue
        neg = 0
        pos = 0
        word_lst = WordPunctTokenizer().tokenize(text)
        word_lst = [w for w in word_lst if w not in stopwords.words('stop_word')]
        for word in word_lst:
            classResult = classifier.classify(word_feats(word))
            if classResult == 'neg':
                neg = neg + 1
            if classResult == 'pos':
                pos = pos + 1

        # SnowNLP sentiment is in 0..1; higher means more positive.
        score = round(SnowNLP(text).sentiments, 2)
        if score > 0.5:
            score = 1
            keyids = getKeywordIds(text, positive_vocab, problem_vocab)
        elif score > 0:
            # NOTE(review): 0 < score <= 0.5 maps to -1 (negative) while
            # score <= 0 maps to 0 — verify these threshold semantics.
            score = -1
            keyids = getKeywordIds(text, negative_vocab, problem_vocab)
        else:
            score = 0
            keyids = getKeywordIds(text, negative_vocab, problem_vocab)

        # Renamed from `dict` — shadowing the builtin is a bug magnet.
        item = {}
        item['id'] = comment_id
        item['score'] = score
        item['keywordids'] = keyids[0]
        item['problem'] = keyids[1]
        data.append(item)
    return data

def word_feats(words):
    """Map every element of *words* to True — NLTK's bag-of-words feature dict."""
    return {word: True for word in words}

def sortProblem():
    """Score each type-2 (problem) keyword's name with SnowNLP sentiment.

    Returns a list of dicts ``{'id', 'title', 'score'}``.
    """
    res = dbo.query("SELECT id,name FROM keyword WHERE type = 2")
    rows = res.fetchall()
    data = []
    for row in rows:
        keyword_id = row[0]
        # BUG fix: the query selects only (id, name) — the original read
        # idx[2], which is out of range for a 2-column row.
        text = row[1]
        if text != '':
            sentiment = SnowNLP(text).sentiments
            # Renamed from `dict` — shadowing the builtin is a bug magnet.
            item = {}
            item['id'] = keyword_id
            item['title'] = text
            item['score'] = sentiment
            print(text + " : " + str(sentiment))
            # BUG fix: the append was commented out, so the function
            # always returned an empty list.
            data.append(item)
    return data

if __name__ == '__main__':
    # Idiom fix: removed `global dbo` — `global` is a no-op at module
    # scope; names bound here are already module globals.
    dbo = dboUtils()
    res = main2()
    df = pandas.DataFrame(res)
    # Show the last 20 scored comments.
    res = df.tail(20)
    print(res)

  • 发表于:
  • 原文链接https://kuaibao.qq.com/s/20181223G0GG5L00?refer=cp_1026
  • 腾讯「腾讯云开发者社区」是腾讯内容开放平台帐号(企鹅号)传播渠道之一,根据《腾讯内容开放平台服务协议》转载发布内容。
  • 如有侵权,请联系 cloudcommunity@tencent.com 删除。

扫码

添加站长 进交流群

领取专属 10元无门槛券

私享最新 技术干货

扫码加入开发者社群
领券
http://www.vxiaotou.com