Nevar pievienot vairāk kā 25 tēmas Tēmai ir jāsākas ar burtu vai ciparu, tā var saturēt domu zīmes ('-') un var būt līdz 35 simboliem gara.

228 rindas
8.7 KiB

  1. # This script is intended for personal and scientific use only
  2. import os
  3. import sys
  4. import re
  5. import hashlib
  6. import json
  7. from time import sleep
  8. import feedparser
  9. import requests
  10. from bs4 import BeautifulSoup, Comment
  11. from bs4.element import CData
  12. # default config location is a 'config.json' next to the script.
  13. try:
  14. filedir = os.path.dirname(os.path.abspath(__file__))
  15. if len(sys.argv) < 2:
  16. configpath = filedir+'/config.json'
  17. print("Using default config location: ", configpath)
  18. config = json.load(open(configpath))
  19. else:
  20. configpath = sys.argv[1]
  21. config = json.load(open(configpath))
  22. except:
  23. print("Problem reading config file: ", configpath)
  24. print("ERROR: Config file not found or invalid!")
  25. sys.exit(1)
  26. public_path = filedir + '/public'
  27. assets_path = public_path + '/assets'
  28. feeds_path = public_path + '/feeds'
  29. # e.g. https://example.com/some-string
  30. assets_url = config['assets_url']
  31. # "I'm a robot which promises you clicks and $ ... Give me ALL your content!"
  32. requestheaders = {
  33. 'user-'+'age'+'nt' :
  34. 'Mo' + 'zill' + 'a/5.' + '0 (' + 'comp' + 'ati' + 'ble; '+'Go'
  35. + 'og'+'le'+ 'bo' + 't/' + '2.1; +http' + '://www.' + 'go'
  36. + 'og'+ 'le'+'.com/'+'bo'+'t.html)'
  37. }
  38. # need filename-safe strings for storing images alongside html files
  39. def get_valid_filename(s):
  40. s = str(s).split('?')[0].strip().strip('/').strip('http://').strip('https://').replace(' ', '-')
  41. return re.sub(r'(?u)[^-\w.]', '-', s)
  42. # Get a unique and valid filename from URL (for images)
  43. def filename_from_url(url):
  44. # remove get attributes and path
  45. new_filename = url.split('?')[0].split('/')[-1]
  46. # Split filename
  47. new_filename = new_filename.split('.')
  48. # insert a hash before suffix
  49. new_filename.insert(1, str(hashlib.md5(url.encode('utf-8')).hexdigest()) )
  50. # convert back to string and extra validate
  51. new_filename = get_valid_filename('.'.join(new_filename))
  52. return new_filename
  53. # Download images and so on
  54. def download_image(url, entry_dir, filename):
  55. # take care of protocol relative URLs ... let's just assume that https works.
  56. if url.startswith('//'):
  57. url = 'https:'+url
  58. response = requests.get(url, headers=requestheaders)
  59. if response.status_code == 200:
  60. with open(assets_path + '/' + entry_dir + '/' + filename, 'wb') as f:
  61. #f.write(response.content)
  62. for chunk in response.iter_content(1024):
  63. f.write(chunk)
  64. def process_feed(obj):
  65. feed_url = obj['source']
  66. output_filename = obj['destination']
  67. print('Updating:', obj['destination'])
  68. # Get the feed
  69. r_feed = requests.get(feed_url, headers=requestheaders)
  70. # TODO: exceptions.(what if 404 or whatever?)
  71. # Store data of new articles
  72. for entry in feedparser.parse(r_feed.text).entries:
  73. entry_dir = get_valid_filename(entry.link) # input e.g. https://orf.at/stories/3117136/
  74. entry_path = assets_path + '/'+ entry_dir
  75. if not os.path.exists(entry_path):
  76. print('New item: ', entry.link)
  77. r = requests.get(entry.link.split('?')[0], headers=requestheaders)
  78. online_soup = BeautifulSoup(r.text, 'html.parser')
  79. content_soup = BeautifulSoup('<div></div>', 'html.parser')
  80. # Remove all Comments
  81. for element in online_soup(text=lambda text: isinstance(text, Comment)):
  82. element.extract()
  83. # domain and path specific rules
  84. # ... split strings for (very simple) ob+fu+sca+tion
  85. if entry.link.startswith('https://or'+'f.a'+'t/sto'+'ries'):
  86. if entry.date:
  87. article_time = content_soup.new_tag('time', datetime=entry.date)
  88. content_soup.div.append(article_time)
  89. article_headline = online_soup.find('h1', attrs={'class': 'story-lead-headline'})
  90. content_soup.div.append(article_headline)
  91. article_body = online_soup.find('div', attrs={'class': 'story-content'})
  92. content_soup.div.append(article_body)
  93. article_link = content_soup.new_tag('a', href=entry.link)
  94. article_link['class'] = 'source'
  95. article_link.string = 'Quelle (' + entry.link + ')'
  96. content_soup.div.append(article_link)
  97. if entry.link.startswith('https://de'+'rst'+'and'+'ard'+'.a'+'t/20'): # url starts with number ... too lazy for regex :)
  98. if entry.published:
  99. article_time = content_soup.new_tag('time', datetime=entry.published)
  100. content_soup.div.append(article_time)
  101. article_headline = online_soup.find('h1', attrs={'itemprop': 'headline'})
  102. content_soup.div.append(article_headline)
  103. # images etc
  104. article_aside = online_soup.find('div', id="content-aside")
  105. content_soup.div.append(article_aside)
  106. article_body = online_soup.find('div', attrs={'itemprop': 'articleBody'})
  107. content_soup.div.append(article_body)
  108. # modify original link -> mobile version and comment section
  109. link_to_comments = re.sub(r'(\/\/)', r'\1mobil.',entry.link.split('?')[0]) + '?_viewMode=forum#'
  110. article_comments_link = content_soup.new_tag('a', href=link_to_comments)
  111. article_comments_link['class'] = 'comments'
  112. article_comments_p = content_soup.new_tag('p')
  113. article_comments_link.string = 'Kommentare'
  114. article_comments_p.append(article_comments_link)
  115. content_soup.div.append(article_comments_p)
  116. article_link = content_soup.new_tag('a', href=entry.link.split('?')[0])
  117. article_link['class'] = 'source'
  118. article_link.string = 'Quelle: ' + entry.link.split('?')[0]
  119. content_soup.div.append(article_link)
  120. # create directory for storing and serving html and images
  121. os.makedirs(entry_path)
  122. # download all article images and replace image source
  123. for img in content_soup.findAll('img'):
  124. if img.get('data-src'):
  125. old_url = img['data-src']
  126. if not old_url.startswith('data:'):
  127. new_filename = filename_from_url(old_url)
  128. img['data-src'] = assets_url + '/' + entry_dir + '/' + new_filename
  129. download_image(old_url, entry_dir, new_filename)
  130. if img.get('src'):
  131. old_url = img['src']
  132. if not old_url.startswith('data:'):
  133. new_filename = filename_from_url(old_url)
  134. img['src'] = assets_url + '/' + entry_dir + '/' + new_filename
  135. download_image(old_url, entry_dir, new_filename)
  136. if img.get('data-srcset'):
  137. srcset = img['data-srcset'].split(', ')
  138. new_srcset = []
  139. for src in srcset:
  140. old_url = src.split(' ')[0]
  141. src_res = src.split(' ')[1]
  142. new_filename = filename_from_url(old_url)
  143. download_image(old_url, entry_dir, new_filename)
  144. new_url = assets_url + '/' + entry_dir + '/' + new_filename
  145. src = ' '.join([new_url, src_res])
  146. new_srcset.append(src)
  147. img['data-srcset'] = ', '.join(new_srcset)
  148. # TODO(?): HTML5 picture tag
  149. f = open(entry_path + '/index.html', 'w')
  150. f.write(str(content_soup))
  151. f.close()
  152. sleep(1.3)
  153. # Create new feed
  154. # Maybe buiding a new feed from scretch using a template would be nicer but ...
  155. # let's just modify the original one!
  156. feed_soup = BeautifulSoup(r_feed.text, 'lxml-xml')
  157. # Exclude items
  158. if obj.get('exclude') and isinstance(obj['exclude'], list):
  159. for e in feed_soup.findAll('item'):
  160. matches = [x for x in obj['exclude'] if x.lower() in e.title.text.lower()]
  161. if len(matches) > 0:
  162. e.extract()
  163. print('Exclude: ', e.title.text, '->', matches)
  164. for e in feed_soup.findAll('item'):
  165. entry_dir = get_valid_filename(e.link.text)
  166. f_content = open(assets_path + '/' + entry_dir + '/index.html', 'r')
  167. content_tag = feed_soup.new_tag('content:encoded')
  168. content_tag.string = CData(f_content.read())
  169. e.append(content_tag)
  170. f_content.close
  171. # create directory if not present
  172. os.makedirs(feeds_path, exist_ok=True)
  173. f = open(feeds_path + '/' + output_filename, 'w')
  174. f.write(str(feed_soup))
  175. f.close()
  176. # Let's actually fetch the stuff!
  177. for feed in config['feeds']:
  178. process_feed(feed)