# This script is intended for personal and scientific use only

import os
import sys
import re
import hashlib
import json
import time

import feedparser
import requests
from bs4 import BeautifulSoup, Comment
from bs4.element import CData


def timestamp():
    return '[' + time.strftime("%d/%b/%Y:%H:%M:%S %z", time.localtime()) + ']'

# default config location is a 'config.json' next to the script.
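# An illustrative config.json (all values are placeholders; the keys match the
# lookups used further down):
#
# {
#     "assets_url": "https://example.com/assets",
#     "request_headers": {"User-Agent": "Mozilla/5.0 (compatible; ...)"},
#     "feeds": [
#         {
#             "source": "https://example.com/rss",
#             "destination": "myfeed.xml",
#             "exclude": ["keyword in title"]
#         }
#     ]
# }
#
# usage: python <this script> [path/to/config.json]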
try:
    filedir = os.path.dirname(os.path.abspath(__file__))
    if len(sys.argv) < 2:
        configpath = filedir+'/config.json'
        print(timestamp(), "Using default config location: ", configpath)
        config = json.load(open(configpath))
    else:
        configpath = sys.argv[1]
        config = json.load(open(configpath))
except:
    print(timestamp(), "Problem reading config file: ", configpath)
    print(timestamp(), "ERROR: Config file not found or invalid!")
    sys.exit(1)

# File locations of feeds and assets are in directories next to the script
public_path = filedir + '/public'
assets_path = public_path + '/assets'
feeds_path = public_path + '/feeds'

# e.g. https://example.com/some-string
assets_url = config['assets_url']
requestheaders = config['request_headers']


# need filename-safe strings for storing images along html files
def get_valid_filename(s):
    # note: str.strip() removes a character set, not a prefix, so drop the URL scheme via regex
    s = str(s).split('?')[0].strip().strip('/')
    s = re.sub(r'^https?://', '', s).replace(' ', '-')
    return re.sub(r'(?u)[^-\w.]', '-', s)
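
# e.g. get_valid_filename('https://orf.at/stories/3117136/') -> 'orf.at-stories-3117136'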


# Get a unique and valid filename from a URL (for images)
def filename_from_url(url):
    # drop the query string and keep only the last path segment
    new_filename = url.split('?')[0].split('/')[-1]
    # split the filename at the dots
    new_filename = new_filename.split('.')
    # insert a hash of the full URL before the suffix
    new_filename.insert(1, str(hashlib.md5(url.encode('utf-8')).hexdigest()))
    # convert back to a string and validate once more
    new_filename = get_valid_filename('.'.join(new_filename))
    return new_filename
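
# e.g. filename_from_url('https://example.com/images/pic.jpg?v=1')
#      -> 'pic.<md5 of the full URL>.jpg'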


# Download images and so on
def download_image(url, entry_dir, filename):
    # take care of protocol relative URLs ... let's just assume that https works.
    if url.startswith('//'):
        url = 'https:'+url
    response = requests.get(url, headers=requestheaders)
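    # note: non-200 responses are silently skipped, so the rewritten <img>
    # sources may point at assets that were never downloaded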
    if response.status_code == 200:
        with open(assets_path + '/' + entry_dir + '/' + filename, 'wb') as f:
            for chunk in response.iter_content(1024):
                f.write(chunk)


# process the feed entries specified in the config
def process_feed(obj):
    feed_url = obj['source']
    output_filename = obj['destination']

    print(timestamp(), 'Updating:', obj['destination'])

    # Step 1: Get the feed
    r_feed = requests.get(feed_url, headers=requestheaders)
    # TODO: exceptions (what if 404 or whatever?)

    # Step 2: Scrape and store data of new articles
    for entry in feedparser.parse(r_feed.text).entries:
        entry_dir = get_valid_filename(entry.link)  # input e.g. https://orf.at/stories/3117136/
        entry_path = assets_path + '/' + entry_dir
        if not os.path.exists(entry_path):
            print(timestamp(), 'New item:', entry.link)
            r = requests.get(entry.link.split('?')[0], headers=requestheaders)

            online_soup = BeautifulSoup(r.text, 'html.parser')
            content_soup = BeautifulSoup('<div></div>', 'html.parser')

            # Remove all Comments
            for element in online_soup(text=lambda text: isinstance(text, Comment)):
                element.extract()

            # domain and path specific rules
            # ... ob+fu+sca+tion for seo
            if entry.link.startswith('https://or'+'f.a'+'t/sto'+'ries'):
                if entry.date:
                    article_time = content_soup.new_tag('time', datetime=entry.date)
                    content_soup.div.append(article_time)
                article_headline = online_soup.find('h1', attrs={'class': 'story-lead-headline'})
                content_soup.div.append(article_headline)
                article_body = online_soup.find('div', attrs={'class': 'story-content'})
                content_soup.div.append(article_body)
                # Add a link to original article
                article_source = content_soup.new_tag('a', href=entry.link)
                article_source['class'] = 'source'
                article_source.string = 'Quelle: ' + entry.link
                content_soup.div.append(article_source)

            if entry.link.startswith('https://de'+'rst'+'and'+'ard'+'.a'+'t/20'):  # url starts with number ... too lazy for regex :)
                if entry.published:
                    article_time = content_soup.new_tag('time', datetime=entry.published)
                    content_soup.div.append(article_time)
                article_headline = online_soup.find('h1', attrs={'itemprop': 'headline'})
                content_soup.div.append(article_headline)
                # images etc
                article_aside = online_soup.find('div', id="content-aside")
                content_soup.div.append(article_aside)
                article_body = online_soup.find('div', attrs={'itemprop': 'articleBody'})
                content_soup.div.append(article_body)
                # Add a link to comments
                # modify original link -> mobile version and comment section
                link_to_comments = re.sub(r'(\/\/)', r'\1mobil.', entry.link.split('?')[0]) + '?_viewMode=forum#'
                article_comments_link = content_soup.new_tag('a', href=link_to_comments)
                article_comments_link['class'] = 'comments'
                article_comments_p = content_soup.new_tag('p')
                article_comments_link.string = 'Kommentare'
                article_comments_p.append(article_comments_link)
                content_soup.div.append(article_comments_p)
                # Add a link to original article
                article_source = content_soup.new_tag('a', href=entry.link.split('?')[0])
                article_source['class'] = 'source'
                article_source.string = 'Quelle: ' + entry.link.split('?')[0]
                article_source_p = content_soup.new_tag('p')
                article_source_p.append(article_source)
                content_soup.div.append(article_source_p)

            # create directory for storing and serving html and images
            os.makedirs(entry_path)

            # download all article images and replace image source
            for img in content_soup.findAll('img'):
                if img.get('data-src'):
                    old_url = img['data-src']
                    if not old_url.startswith('data:'):
                        new_filename = filename_from_url(old_url)
                        img['data-src'] = assets_url + '/' + entry_dir + '/' + new_filename
                        download_image(old_url, entry_dir, new_filename)
                if img.get('src'):
                    old_url = img['src']
                    if not old_url.startswith('data:'):
                        new_filename = filename_from_url(old_url)
                        img['src'] = assets_url + '/' + entry_dir + '/' + new_filename
                        download_image(old_url, entry_dir, new_filename)
                if img.get('data-srcset'):
                    srcset = img['data-srcset'].split(', ')
                    new_srcset = []
                    for src in srcset:
                        old_url = src.split(' ')[0]
                        src_res = src.split(' ')[1]
                        new_filename = filename_from_url(old_url)
                        download_image(old_url, entry_dir, new_filename)
                        new_url = assets_url + '/' + entry_dir + '/' + new_filename
                        src = ' '.join([new_url, src_res])
                        new_srcset.append(src)
                    img['data-srcset'] = ', '.join(new_srcset)
                # TODO(?): HTML5 picture tag

            # Save HTML
            f = open(entry_path + '/index.html', 'w')
            f.write(str(content_soup))
            f.close()

            # Wait a bit
            time.sleep(1.3)

    # Step 3: Create the 'new' feed
    # Maybe building a new feed from scratch using a template would be nicer but ...
    # let's just modify the original one!
    feed_soup = BeautifulSoup(r_feed.text, 'lxml-xml')
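    # note: the 'lxml-xml' parser requires the lxml package to be installed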

    # Exclude items
    if obj.get('exclude') and isinstance(obj['exclude'], list):
        for e in feed_soup.findAll('item'):
            matches = [x for x in obj['exclude'] if x.lower() in e.title.text.lower()]
            if len(matches) > 0:
                e.extract()
                print(timestamp(), 'Exclude: ', e.title.text, '->', matches)

    # Add content
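    # (assumes step 2 above stored an index.html for every remaining item)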
    for e in feed_soup.findAll('item'):
        entry_dir = get_valid_filename(e.link.text)
        f_content = open(assets_path + '/' + entry_dir + '/index.html', 'r')
        content_tag = feed_soup.new_tag('content:encoded')
        content_tag.string = CData(f_content.read())
        e.append(content_tag)
        f_content.close()

    # create directory if not present
    os.makedirs(feeds_path, exist_ok=True)
    f = open(feeds_path + '/' + output_filename, 'w')
    f.write(str(feed_soup))
    f.close()
    print(timestamp(), 'Done!')


# Let's actually fetch the stuff!
for feed in config['feeds']:
    process_feed(feed)