# This script is intended for personal and scientific use only

import os
import sys
import re
import hashlib
import json
from time import sleep

import feedparser
import requests
from bs4 import BeautifulSoup, Comment
from bs4.element import CData


# Default config location is a 'config.json' next to the script.
try:
    filedir = os.path.dirname(os.path.abspath(__file__))
    if len(sys.argv) < 2:
        configpath = filedir + '/config.json'
        print("Using default config location: ", configpath)
    else:
        configpath = sys.argv[1]
    with open(configpath) as configfile:
        config = json.load(configfile)

except (OSError, ValueError):
    print("Problem reading config file: ", configpath)
    print("ERROR: Config file not found or invalid!")
    sys.exit(1)
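
# For reference, the two ways of starting the script that the argv handling
# above supports (the script filename 'fullfeed.py' is only a placeholder):
#
#   python3 fullfeed.py                      # uses ./config.json next to the script
#   python3 fullfeed.py /path/to/config.json # uses an explicit config path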

print(filedir)
public_path = filedir + '/public'
assets_path = public_path + '/assets'

# e.g. https://example.com/some-string
base_url = config['base_url']
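
# A minimal config.json sketch, assuming only the keys this script actually
# reads ('base_url' above, plus 'feeds' with 'source'/'destination' at the
# bottom of the file); all values are placeholders:
#
# {
#     "base_url": "https://example.com/some-string",
#     "feeds": [
#         {
#             "source": "https://example.org/rss",
#             "destination": "example.xml"
#         }
#     ]
# }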


# "I'm a robot which promises you clicks and $ ... Give me ALL your content!"
requestheaders = {
    'user-'+'age'+'nt':
        'Mo' + 'zill' + 'a/5.' + '0 (' + 'comp' + 'ati' + 'ble; ' + 'Go'
        + 'og' + 'le' + 'bo' + 't/' + '2.1; +http' + '://www.' + 'go'
        + 'og' + 'le' + '.com/' + 'bo' + 't.html)'
}


# Need filename-safe strings for storing images alongside the HTML files
def get_valid_filename(s):
    s = str(s).split('?')[0].strip()
    s = re.sub(r'^https?://', '', s).strip('/').replace(' ', '-')
    return re.sub(r'(?u)[^-\w.]', '-', s)
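
# Example, using the sample article URL that appears further down:
#   get_valid_filename('https://orf.at/stories/3117136/')
#   -> 'orf.at-stories-3117136'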

# Get a unique and valid filename from a URL (for images)
def filename_from_url(url):
    # drop the query string and the path
    new_filename = url.split('?')[0].split('/')[-1]
    # split the filename at the dots
    new_filename = new_filename.split('.')
    # insert a hash of the full URL before the suffix
    new_filename.insert(1, hashlib.md5(url.encode('utf-8')).hexdigest())
    # convert back to a string and validate once more
    new_filename = get_valid_filename('.'.join(new_filename))
    return new_filename
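
# Illustrative example (the URL is made up; the inserted part is the full
# 32-character MD5 hex digest of the original URL including its query string):
#   filename_from_url('https://example.com/img/photo.jpg?v=2')
#   -> 'photo.<md5 hex digest of the full URL>.jpg'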

# Download images and so on
def download_image(url, entry_dir, filename):
    # take care of protocol-relative URLs ... let's just assume that https works.
    if url.startswith('//'):
        url = 'https:' + url
    response = requests.get(url, headers=requestheaders)
    if response.status_code == 200:
        with open(assets_path + '/' + entry_dir + '/' + filename, 'wb') as f:
            for chunk in response.iter_content(1024):
                f.write(chunk)
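
# Typical call, as used in the image loop below (all values are illustrative):
#   download_image('https://example.com/img/photo.jpg',
#                  'example.com-some-story',
#                  'photo.<md5>.jpg')
# which writes the file to public/assets/example.com-some-story/photo.<md5>.jpg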


def process_feed(feed_url, output_filename):

    # Get the feed
    r_feed = requests.get(feed_url, headers=requestheaders)

    # TODO: error handling (what if the feed returns a 404 or similar?)

    # Store data of new articles
    for entry in feedparser.parse(r_feed.text).entries:
        print(entry.link)
        entry_dir = get_valid_filename(entry.link)  # input e.g. https://orf.at/stories/3117136/
        entry_path = assets_path + '/' + entry_dir
        if not os.path.exists(entry_path):
            r = requests.get(entry.link.split('?')[0], headers=requestheaders)

            online_soup = BeautifulSoup(r.text, 'html.parser')

            content_soup = BeautifulSoup('<article></article>', 'html.parser')

            # Remove all HTML comments
            for element in online_soup(text=lambda text: isinstance(text, Comment)):
                element.extract()

            # domain- and path-specific rules
            # ... split strings for (very simple) ob+fu+sca+tion

            if entry.link.startswith('https://or'+'f.a'+'t/sto'+'ries'):
                if entry.get('date'):
                    article_time = content_soup.new_tag('time', datetime=entry.date)
                    content_soup.article.append(article_time)
                article_headline = online_soup.find('h1', attrs={'class': 'story-lead-headline'})
                content_soup.article.append(article_headline)
                article_body = online_soup.find('div', attrs={'class': 'story-content'})
                content_soup.article.append(article_body)
                article_link = content_soup.new_tag('a', href=entry.link)
                article_link['class'] = 'source'
                article_link.string = 'Quelle (' + entry.link + ')'
                content_soup.article.append(article_link)
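
                # For such a story the assembled fragment ends up looking roughly like:
                #   <article>
                #     <time datetime="..."></time>
                #     <h1 class="story-lead-headline">...</h1>
                #     <div class="story-content">...</div>
                #     <a class="source" href="...">Quelle (...)</a>
                #   </article>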

            if entry.link.startswith('https://de'+'rst'+'and'+'ard'+'.a'+'t/20'):  # URL starts with a number ... too lazy for a regex :)
                if entry.get('published'):
                    article_time = content_soup.new_tag('time', datetime=entry.published)
                    content_soup.article.append(article_time)
                article_headline = online_soup.find('h1', attrs={'itemprop': 'headline'})
                content_soup.article.append(article_headline)
                # images etc.
                article_aside = online_soup.find('div', id="content-aside")
                content_soup.article.append(article_aside)
                article_body = online_soup.find('div', attrs={'itemprop': 'articleBody'})
                content_soup.article.append(article_body)

                # modify the original link -> mobile version with comment section
                link_to_comments = re.sub(r'(\/\/)', r'\1mobil.', entry.link.split('?')[0]) + '?_viewMode=forum#'
                article_comments_link = content_soup.new_tag('a', href=link_to_comments)
                article_comments_link['class'] = 'comments'
                article_comments_link.string = 'Kommentare'
                content_soup.article.append(article_comments_link)

                article_link = content_soup.new_tag('a', href=entry.link)
                article_link['class'] = 'source'
                article_link.string = 'Quelle (' + entry.link.split('?')[0] + ')'
                content_soup.article.append(article_link)
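
                # The re.sub above only injects the 'mobil.' subdomain; with an
                # illustrative article URL the rewrite works like this:
                #   https://<site>/2000012345678/some-slug
                #   -> https://mobil.<site>/2000012345678/some-slug?_viewMode=forum#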


            # create a directory for storing and serving the HTML and images
            os.makedirs(entry_path)

            # download all article images and replace the image sources
            for img in content_soup.findAll('img'):
                print(img)
                if img.get('data-src'):
                    old_url = img['data-src']
                    if not old_url.startswith('data:'):
                        new_filename = filename_from_url(old_url)
                        img['data-src'] = base_url + '/' + entry_dir + '/' + new_filename
                        download_image(old_url, entry_dir, new_filename)
                if img.get('src'):
                    old_url = img['src']
                    if not old_url.startswith('data:'):
                        new_filename = filename_from_url(old_url)
                        img['src'] = base_url + '/' + entry_dir + '/' + new_filename
                        download_image(old_url, entry_dir, new_filename)
                if img.get('data-srcset'):
                    srcset = img['data-srcset'].split(', ')
                    new_srcset = []
                    for src in srcset:
                        # each entry is "<url> <width descriptor>"; the descriptor may be missing
                        parts = src.split(' ')
                        old_url = parts[0]
                        src_res = parts[1] if len(parts) > 1 else ''
                        new_filename = filename_from_url(old_url)
                        download_image(old_url, entry_dir, new_filename)
                        new_url = base_url + '/' + entry_dir + '/' + new_filename
                        new_srcset.append((new_url + ' ' + src_res).strip())
                    img['data-srcset'] = ', '.join(new_srcset)
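
                    # Illustrative before/after for the srcset rewrite (URLs and
                    # widths are made up, <md5> stands for the inserted hash):
                    #   "https://example.com/a.jpg 640w, https://example.com/b.jpg 1280w"
                    #   -> "<base_url>/<entry_dir>/a.<md5>.jpg 640w, <base_url>/<entry_dir>/b.<md5>.jpg 1280w"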

            # TODO(?): handle the HTML5 <picture> tag as well

            with open(entry_path + '/index.html', 'w') as f:
                f.write(content_soup.prettify())

            # be polite and do not hammer the site
            sleep(1.3)
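
            # After this block each freshly processed entry has its own folder
            # under the assets directory, e.g. (names are illustrative):
            #   public/assets/<entry_dir>/index.html
            #   public/assets/<entry_dir>/photo.<md5>.jpg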


    # Create the new feed

    # Maybe building a new feed from scratch using a template would be nicer but ...
    # let's just modify the original one!

    feed_soup = BeautifulSoup(r_feed.text, 'lxml-xml')

    for e in feed_soup.findAll('item'):
        entry_dir = get_valid_filename(e.link.text)
        with open(assets_path + '/' + entry_dir + '/index.html', 'r') as f_content:
            content_tag = feed_soup.new_tag('content:encoded')
            content_tag.string = CData(f_content.read())
            e.append(content_tag)

    with open(public_path + '/' + output_filename, 'w') as f:
        f.write(feed_soup.prettify())
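
    # In the written feed every <item> gains a full-text element along these
    # lines (structure only, content shortened):
    #   <item>
    #     ...original elements...
    #     <content:encoded><![CDATA[ <article> ... </article> ]]></content:encoded>
    #   </item>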



# Let's actually fetch the stuff!

for feed in config['feeds']:
    process_feed(feed['source'], feed['destination'])