/
group_spider_py27.py
154 lines (141 loc) · 5.53 KB
/
group_spider_py27.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
# @Time : 2019/2/25 13:33
# @Author : Spareribs
# @File : group_spider.py
# @Software : PyCharm
# @Description :
"""
import json
import re
import urllib
import requests
from urllib import quote
headers = {
'accept': "application/json, text/plain, */*",
'accept-encoding': "gzip, deflate, br",
'accept-language': "en,zh-CN;q=0.9,zh;q=0.8,zh-TW;q=0.7",
# ********************************
# 以下内容要根据实际情况修改
# ********************************
'user-agent': "****",
# 将所有cookie的内容写上
'cookie': "UM_distinctid=****;zsxq_access_token=****;_uab_collina=****;_umdata=****;avatar_url=****;cna=****;isg=****;name=****;upload_channel=****;user_id=****;ws_address=****;"
}
def get_group_topics(headers, groups_id):
"""
通过小组获取小组内所有的主题ID
# TODO 目前只有7个主题,不知道超过count=20会有什么坑
:param headers: 请求所需要的头部信息
:param groups_id: 小组ID
:return: 主题ID列表
"""
url = "https://api.zsxq.com/v1.10/groups/{0}/topics?count=20".format(groups_id)
print(u"[Info]:开始抓取所有主题:\n{0}".format(url))
topic_ids = []
response = requests.request("GET", url, headers=headers)
# print(response.content)
res_dict = json.loads(response.text)
topics = res_dict.get("resp_data").get("topics")
# print(topics)
if topics:
for _topic in topics:
try:
topic_id = _topic.get("topic_id")
_text = _topic.get("talk").get("text")
topic_ids.append(topic_id)
# print(topic_id, _text)
# print(u"[Info]: {0}".format(_text))
except AttributeError as e:
print(e)
return topic_ids
def get_topics_comments(headers, topics_id, begin_time=None):
"""
通过主题获取小组内所有的评论内容,获得打卡信息和链接
:param headers: 请求所需要的头部信息
:param topics_id: 主题ID
:param begin_time: 偏移量, 这里通过实践来确定起始位置
:return: 最后一个人的 create_time
"""
if begin_time:
url = "https://api.zsxq.com/v1.10/topics/{0}/comments?" \
"count=30&sort=asc&begin_time={1}".format(topics_id, begin_time)
else:
url = "https://api.zsxq.com/v1.10/topics/{0}/comments?count=30&sort=asc".format(topics_id)
print(url)
response = requests.request("GET", url, headers=headers)
# print(response.content)
res_dict = json.loads(response.text)
comments = res_dict.get("resp_data").get("comments")
# print(comments)
create_time = None
if comments:
for _comment in comments:
try:
_create_time = _comment.get("create_time")
create_time = _create_time
_text = _comment.get("text").encode("utf-8")
# print(_text)
# print(_text.split("<")[0])
_id = re.findall(r'[1-9]\d*', _text.split("<")[0])[0]
_url = urllib.unquote(urllib.unquote(re.findall(r'href="(.*?)"', _text.split("<")[1])[0]))
print("学员编号: {0:>3} 打卡时间:{1} 链接:{2}".format(_id, _create_time, _url))
except AttributeError as e:
# print(e)
pass
except IndexError as e:
# print(e)
pass
except TypeError as e:
# print(e)
pass
return create_time
def get_comments_count(headers, topics_id):
"""
用于获取总评论数
:param headers:
:param topics_id: 主题ID
:return: 返回总评论条数
"""
url = "https://api.zsxq.com/v1.10/topics/{0}".format(topics_id)
print(u"{0}\n[Info]: 开始获取 主题 ** {1} ** 所有评论数 ".format("*" * 40, topics_id))
response = requests.request("GET", url, headers=headers)
# print(response.content)
res_dict = json.loads(response.text)
comments_count = res_dict.get("resp_data").get("topic").get("comments_count")
print("[Info]: 获取完毕 主题 ** {0} ** 评论数为:{1}".format(topics_id, comments_count))
return comments_count
def main():
"""
知识星球各星球 topic_id
Datawhale编程: 222248421551
初级算法梳理:822248424542
初级算法梳理(二):222248481411
高级算法编程:455528252848
Datawhale统计学:455528252888
Datawhale数据挖掘:555541454184
Datawhale爬虫:455528252818
Datawhale-NLP实战:222248424811
Datawhale数据分析:144415141422
Datawhale-MySQL:822245485482
Datawhale数据竞赛:455528281128
Leetcode集训打卡:555515244884
Datawhale知乎小组:458152115158
Datawhale-Python基础:145841884822
"""
_topic_id_list = get_group_topics(headers, "144415141422")
for _topic_id in _topic_id_list:
# 获取评论数
comments_count = get_comments_count(headers, _topic_id)
print(comments_count)
# 首次获取 topic_id 的数据
create_time = get_topics_comments(headers, _topic_id)
# 如果评论数超过30 继续获取 topic_id 的数据
while comments_count > 30:
create_time = quote(create_time, 'utf-8')
create_time = get_topics_comments(headers, _topic_id, create_time) # 第二次请求都要加create_time
comments_count -= 30
# TODO 解决create_time编码格式的问题
if __name__ == "__main__":
main()