# spider.py
from bs4 import BeautifulSoup
import requests
import operator
import base64
import time
from Crypto.Cipher import AES
import oss2

faas_function_name__support = 'support'
faas_function_name__handle = 'handle'

const_key_function_id = "faas__function-id"
const_key_request_wrap = "faas__request_wrap"
const_key_params = "faas__req.params"
const_key_context = "faas__req.context"
const_key_result = "faas__invoke_result"
const_key_result_success = "faas__success"
const_key_reload = "faas__reload"
const_key_trace_id = "faas__trace_id"
endpoint = 'http://oss-cn-hongkong.aliyuncs.com'  # The bucket is in the Hong Kong region.
auth = oss2.Auth('LTAI5t5ZyATP7DL4nMdK51eL', 'WnJqm3jpuP0JUMo3to8wzcVHgIrBAB')
bucket = oss2.Bucket(auth, endpoint, 'oss-aliyun-hk')
# Decide whether this flow can handle the event.
def support(event):
    return True
class Aes_byte:
    iv: bytes = b''
    key: bytes = b''
    BLOCK_SIZE: int = 16  # Cipher block size in bytes.

    def __init__(self, key: bytes, iv: bytes, BLOCK_SIZE: int = 16):
        self.iv = iv
        self.key = key
        self.BLOCK_SIZE = BLOCK_SIZE

    def __str__(self):
        return "AES encrypt/decrypt for byte strings"
    # Zero-pad a value up to a multiple of BLOCK_SIZE.
    def pad(self, value) -> bytes:
        count = len(value)
        if count % self.BLOCK_SIZE != 0:
            add = self.BLOCK_SIZE - (count % self.BLOCK_SIZE)
        else:
            add = 0
        # "\0" must be encoded to bytes, otherwise it cannot be concatenated with value.
        text = value + ("\0".encode() * add)
        return text
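
    # For instance, with the default BLOCK_SIZE of 16:
    #   pad(b'abc')              -> b'abc' + b'\0' * 13  (16 bytes total)
    #   pad(b'0123456789abcdef') -> unchanged, already block-aligned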
    # Encrypt plaintext with AES-CBC.
    def AES_encrypt(self, data: bytes) -> bytes:
        # Pad input whose length is not a multiple of 16 bytes.
        buffer = self.pad(data)  # The data passed in must already be bytes.
        # Create the cipher object.
        cryptor = AES.new(self.key, AES.MODE_CBC, self.iv)
        # Perform the encryption.
        return cryptor.encrypt(buffer)
    # Decrypt ciphertext with AES-CBC.
    def AES_decrypt(self, data: bytes) -> bytes:
        # Ciphertext should already be block-aligned, so padding here is a no-op safeguard.
        buffer = self.pad(data)
        # Create the cipher object.
        cryptor = AES.new(self.key, AES.MODE_CBC, self.iv)
        # Perform the decryption.
        return cryptor.decrypt(buffer)
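
# A minimal round-trip sketch for Aes_byte. The 16-byte key and IV below are
# illustrative placeholders, not values used anywhere in this project:
#
#   aes = Aes_byte(b'0123456789abcdef', b'fedcba9876543210')
#   ciphertext = aes.AES_encrypt(b'hello world')
#   plaintext = aes.AES_decrypt(ciphertext).rstrip(b'\0')  # drop the zero padding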
def handle(faas__request_wrap):
    # Pull the parameters and the input context out of the request wrapper.
    params = faas__request_wrap[const_key_params]
    context = faas__request_wrap[const_key_context]
    # Take the task and execute it; a post step pushes the result to MQ.
    if not params.get('task'):
        raise RuntimeError('no task defined')
    task = params['task']
    headers = task['headers']
    method = task['requestMethod']
    charset = task['charset']
    req_start_time = time.time()
    r = requests.request(method, task['url'], headers=headers)
    req_cost = time.time() - req_start_time
    process_start_time = time.time()
    content = None
    next_command = None
    if operator.contains(task['resultType'], '[B'):
        video_bytes = r.content
        if task['tsPart']['key']['method'] != 'NONE':
            key_bytes = base64.b64decode(task['tsPart']['key']['key'])
            iv_bytes = base64.b64decode(task['tsPart']['key']['iv'])
            video_bytes = Aes_byte(key_bytes, iv_bytes).AES_decrypt(video_bytes)
        remote_path = 'spider/' + task['progressId'] + '/0000.ts'
        bucket.put_object(remote_path, video_bytes)
        content = remote_path
        # file_path = task['tsPart']['filePath']
        # if not os.path.exists(os.path.dirname(file_path)):
        #     os.makedirs(os.path.dirname(file_path))
        # with open(file_path, 'wb') as f:
        #     f.write(video_bytes)
        # content = file_path
        next_command = 'upload_to_oss'
        # Alternative follow-up commands: 'upload_to_disk', 'upload_to_ssh', 'upload_to_webdav'.
        # TODO: put the remote path into the context.
    else:
        # r.encoding is taken from the charset field of the HTTP response headers; when
        # the header carries no charset, requests falls back to ISO-8859-1, which cannot
        # decode Chinese text and is the usual cause of mojibake. r.apparent_encoding is
        # detected from the page content itself, so it is more accurate; when a page
        # comes out garbled, assign apparent_encoding to encoding.
        # r.encoding = r.apparent_encoding
        r.encoding = charset
        soup = BeautifulSoup(r.text, 'html.parser')
        if soup.title:
            title = soup.title.text
        content = str(soup)
        # Update the crawler task queue counters.
        # app.config['crawler_queued'] -= 1
        # app.config['crawler_completed'] += 1
    # --------------------------------------------------------------------
    #
    # --------------------------------------------------------------------
    process_cost = time.time() - process_start_time
    print('spider task: ' + task['requestMethod'] + ' ' + task['url']
          + ' req_cost=%.2f' % req_cost + ' process_cost=%.2f' % process_cost)
    result = {
        'process_result': {
            'spider': context
        },
        'content': content,
        'next-command': next_command
    }
    return result
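
# A minimal local invocation sketch, assuming the faas runtime simply passes a
# dict keyed by const_key_params / const_key_context. The URL and task fields
# below are illustrative placeholders, not values from this project:
if __name__ == '__main__':
    wrap = {
        const_key_params: {
            'task': {
                'url': 'https://example.com/',
                'requestMethod': 'GET',
                'headers': {},
                'charset': 'utf-8',
                'resultType': 'text',  # no '[B' marker, so the HTML branch runs
            }
        },
        const_key_context: {},
    }
    print(handle(wrap))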