get_Cnbeta_news.py
import os
import re
import sys
import time
import atexit
import signal
import urllib.parse
from datetime import datetime

import requests
from bs4 import BeautifulSoup

HOMEPAGE_URL = 'https://www.cnbeta.com/'
JSON_URL = HOMEPAGE_URL + 'home/more'
AGENT = ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
         '(KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36 OPR/52.0.2871.99')
HEADERS = {
    'User-Agent': AGENT,
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Referer': HOMEPAGE_URL,
}
class spider(object):
    def __init__(self):
        print('Starting to crawl...')

    def get_resource(self, url, headers):
        """GET a URL; return the Response, or None if the request failed."""
        try:
            r = requests.get(url, headers=headers)
            r.raise_for_status()
            r.encoding = r.apparent_encoding
        except Exception as e:
            print(e)
            r = None
        return r

    def get_csrf(self, res):
        """Extract the CSRF parameter name and token from the homepage <head>."""
        if res is None:
            return None
        soup = BeautifulSoup(res.text, 'html.parser')
        param = soup.head.find(attrs={'name': 'csrf-param'})['content']
        token = soup.head.find(attrs={'name': 'csrf-token'})['content']
        return {'param': param, 'token': token}

    def get_timestamp_ms(self):
        return round(time.time() * 1000)
    def get_json_url(self, csrf, page):
        """Build the AJAX URL that returns one page of the news feed as JSON."""
        params = {
            'type': 'all',
            'page': page,
            csrf['param']: csrf['token'],
            '_': self.get_timestamp_ms(),  # cache-busting timestamp
        }
        return JSON_URL + '?' + urllib.parse.urlencode(params)

    def get_latest_days_of_year(self, n):
        """Return the Unix timestamp for n days before now (the cutoff)."""
        return time.time() - n * 24 * 60 * 60
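    # Example of what get_json_url produces (values hypothetical): page 2
    # yields something like
    #   https://www.cnbeta.com/home/more?type=all&page=2&csrf-name=abc123&_=1712345678901
    # where the param/token pair comes from get_csrf and '_' defeats caching.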
    def print_news(self, news_data, days, keyword_list, fpath):
        """Scan one JSON page; append keyword matches to per-keyword files.

        Returns True once an item older than the cutoff is seen, which
        tells the caller to stop paging.
        """
        ticks = self.get_latest_days_of_year(days)
        if not news_data.get('result'):
            return True
        end = False
        for news in news_data['result']['list']:
            d = datetime.strptime(news['inputtime'], '%Y-%m-%d %H:%M')
            report_time = time.mktime(d.timetuple())
            if report_time < ticks:
                end = True
                break
            for keyword in keyword_list:
                pattern = re.compile(keyword, re.IGNORECASE)
                if pattern.findall('{hometext}'.format(**news)):
                    print('hit:', keyword)
                    print('{inputtime} {label[name]} {title:<40} \n{url_show} \n{hometext}'.format(**news))
                    filename = os.path.join(fpath, keyword + '_cnBeta.txt')
                    with open(filename, 'a', encoding='utf-8') as f:
                        f.write('title:' + '{title:<40}'.format(**news) + '\r\n')
                        f.write('time:' + '{inputtime}'.format(**news) + '\r\n')
                        f.write('summ:' + '{hometext}'.format(**news) + '\r\n')
                        f.write('\r\n')
        return end
    def process_cnBeta(self, fpath, days, keyword_list):
        """Crawl page by page until items fall outside the last `days` days."""
        homepage_res = self.get_resource(HOMEPAGE_URL, HEADERS)
        csrf = self.get_csrf(homepage_res)
        if csrf is None:
            return
        # Start each keyword file fresh with a small header block.
        for kword in keyword_list:
            filename = os.path.join(fpath, kword + '_cnBeta.txt')
            if os.path.exists(filename):
                os.remove(filename)
            with open(filename, 'a', encoding='utf-8') as f:
                f.write('*********************************************\r\n')
                f.write('keyword:' + kword + '\r\n')
                f.write('*********************************************\r\n')
                f.write('\r\n')
        page = 1
        end = False
        while not end:
            json_url = self.get_json_url(csrf, page)
            json_res = self.get_resource(json_url, HEADERS)
            if json_res is None:
                break
            end = self.print_news(json_res.json(), days, keyword_list, fpath)
            page += 1
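# Usage sketch for the spider alone (path and keywords are illustrative,
# not from the original script):
#
#   s = spider()
#   s.process_cnBeta('/tmp', 1, ['AI', '5G'])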
class cnBeta_Daemon:
    """A classic double-fork Unix daemon; subclass and override run()."""

    def __init__(self, pidfile='/tmp/daemon-example.pid', stdin='/dev/null',
                 stdout='/dev/null', stderr='/dev/null'):
        self.stdin = stdin
        self.stdout = stdout
        self.stderr = stderr
        self.pidfile = pidfile

    def daemonize(self):
        if os.path.exists(self.pidfile):
            raise RuntimeError('Already running.')
        # First fork (detaches from parent)
        try:
            if os.fork() > 0:
                raise SystemExit(0)
        except OSError as e:
            raise RuntimeError('fork #1 failed: {0} ({1})\n'.format(e.errno, e.strerror))
        os.chdir('/')
        os.setsid()
        os.umask(0o22)
        # Second fork (relinquish session leadership)
        try:
            if os.fork() > 0:
                raise SystemExit(0)
        except OSError as e:
            raise RuntimeError('fork #2 failed: {0} ({1})\n'.format(e.errno, e.strerror))
        # Flush I/O buffers
        sys.stdout.flush()
        sys.stderr.flush()
        # Replace file descriptors for stdin, stdout, and stderr
        with open(self.stdin, 'rb', 0) as f:
            os.dup2(f.fileno(), sys.stdin.fileno())
        with open(self.stdout, 'ab', 0) as f:
            os.dup2(f.fileno(), sys.stdout.fileno())
        with open(self.stderr, 'ab', 0) as f:
            os.dup2(f.fileno(), sys.stderr.fileno())
        # Write the PID file
        with open(self.pidfile, 'w') as f:
            print(os.getpid(), file=f)
        # Arrange to have the PID file removed on exit/signal
        atexit.register(lambda: os.remove(self.pidfile))
        signal.signal(signal.SIGTERM, self.__sigterm_handler)

    # Signal handler for termination (required)
    @staticmethod
    def __sigterm_handler(signo, frame):
        raise SystemExit(1)
    def start(self):
        try:
            self.daemonize()
        except RuntimeError as e:
            print(e, file=sys.stderr)
            raise SystemExit(1)
        self.run()

    def stop(self):
        try:
            if os.path.exists(self.pidfile):
                with open(self.pidfile) as f:
                    os.kill(int(f.read()), signal.SIGTERM)
            else:
                print('Not running.', file=sys.stderr)
                raise SystemExit(1)
        except OSError as e:
            # Stale PID file: the process is already gone, so clean up.
            if 'No such process' in str(e) and os.path.exists(self.pidfile):
                os.remove(self.pidfile)

    def restart(self):
        self.stop()
        self.start()

    def run(self, n=1, keyword_list=None):
        """Override in a subclass with the daemon's work loop."""
        pass
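# A minimal driver sketch (an assumption, not part of the original file):
# subclass cnBeta_Daemon so start/stop/restart manage a background loop
# that re-crawls once an hour. Paths and keywords are illustrative.

class NewsDaemon(cnBeta_Daemon):
    def run(self, n=1, keyword_list=None):
        keyword_list = keyword_list or ['AI', '5G']
        s = spider()
        while True:
            s.process_cnBeta('/tmp', n, keyword_list)
            time.sleep(3600)  # re-crawl once an hour

if __name__ == '__main__':
    daemon = NewsDaemon(pidfile='/tmp/cnbeta_daemon.pid',
                        stdout='/tmp/cnbeta_daemon.log',
                        stderr='/tmp/cnbeta_daemon.log')
    if len(sys.argv) == 2 and sys.argv[1] in ('start', 'stop', 'restart'):
        getattr(daemon, sys.argv[1])()
    else:
        print('usage: {} start|stop|restart'.format(sys.argv[0]), file=sys.stderr)
        raise SystemExit(2)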