# JD.com product-review scraper: fetch comments for one SKU, count Chinese
# keywords with jieba, and render the top words as a word cloud.
import json
import os
import re
import sys
from os import path

import jieba
import matplotlib.pyplot as plt
import numpy
import numpy as np
import pandas as pd
import requests
from PIL import Image
from wordcloud import WordCloud
def get_comments():
    """Fetch user comments for one JD.com product across review pages 1-4.

    The endpoint returns JSONP of the form ``callbackN({...});`` where the
    callback name embeds the page number; the wrapper is stripped before
    parsing the JSON payload.

    Returns:
        str: the ``content`` fields of every fetched comment, concatenated.
    """
    collected = []  # accumulate parts and join once (avoids quadratic +=)
    callback = "fetchJSON_comment9"
    sku_id = "1109759"
    for page in range(1, 5):
        page_str = str(page)
        url = ('https://club.jd.com/comment/productPageComments.action?callback='
               + callback + page_str + '&productId=' + sku_id
               + '&score=0&sortType=5&page=' + page_str
               + '&pageSize=10&isShadowSku=0&rid=0&fold=1')
        resp = requests.get(url)
        # Debug trace: echo the URL and the JSONP prefix ("callbackN(").
        print(url, resp.text[0:len(callback + page_str) + 1])
        # Strip "callbackN(" prefix and trailing ");" to get bare JSON.
        data = json.loads(resp.text[len(callback + page_str) + 1:-2])
        for comment in data['comments']:
            collected.append(comment['content'])
        print(page, resp.text[0:20])
    return ''.join(collected)
# Module-level text buffer handed to data_clear() by make_wordclound();
# empty here, so the comment text is obtained at analysis time.
xt=""
def data_clear(xt):
    """Tokenize Chinese comment text and return per-word frequency counts.

    Args:
        xt: raw comment text; when empty/falsy, comments are scraped via
            get_comments() instead.

    Returns:
        pandas.DataFrame with columns ``keywords`` and ``total``, sorted by
        descending frequency.
    """
    # Fall back to scraping only when no text was supplied.  The original
    # code unconditionally overwrote the argument and then called
    # sys.exit(xt), which terminated the program before any analysis ran —
    # clearly a debugging leftover, removed here.
    if not xt:
        xt = get_comments()
    # Keep only CJK unified ideographs; drops punctuation, digits, latin.
    pattern = re.compile(r'[\u4e00-\u9fa5]+')
    filedata = re.findall(pattern, xt)
    xx = ''.join(filedata)
    # Segment the continuous Chinese text into words.
    cleared = pd.DataFrame({'keywords': jieba.lcut(xx)})
    # Stopword list: GBK-encoded, tab-separated, one word per line.
    stopwords = pd.read_csv("chineseStopWords.txt", index_col=False,
                            quoting=3, sep="\t", names=['stopword'],
                            encoding='GBK')
    cleared = cleared[~cleared.keywords.isin(stopwords.stopword)]
    # Count occurrences of each surviving word.
    count_words = cleared.groupby('keywords')['keywords'].agg(total='count')
    count_words = count_words.reset_index().sort_values(
        by=["total"], ascending=False)
    return count_words
def make_wordclound():
    """Render a word cloud of the 200 most frequent comment keywords."""
    # Rows of data_clear() are (keyword, count) pairs; take the top 200.
    top_rows = data_clear(xt).head(200).values
    frequencies = {word: count for word, count in top_rows}
    cloud = WordCloud(font_path="simsun.ttc",
                      background_color="#EEEEEE",
                      max_font_size=250,
                      width=2100,
                      height=1200)
    cloud = cloud.fit_words(frequencies)
    plt.imshow(cloud)
    plt.axis("off")
    plt.show()
# Script entry point: scrape, analyse, and display the word cloud.
if __name__ == "__main__":
    make_wordclound()