# This script is intended for personal and scientific use only

import os
import sys
import re
import hashlib
import json
import time
import feedparser
import requests
from bs4 import BeautifulSoup, Comment
from bs4.element import CData


def timestamp():
    return '[' + time.strftime("%d/%b/%Y:%H:%M:%S %z", time.localtime()) + ']'


# Default config location is a 'config.json' next to the script;
# an alternative path can be passed as the first command line argument.
try:
    filedir = os.path.dirname(os.path.abspath(__file__))
    if len(sys.argv) < 2:
        configpath = filedir + '/config.json'
        print(timestamp(), "Using default config location:", configpath)
    else:
        configpath = sys.argv[1]
    with open(configpath) as configfile:
        config = json.load(configfile)

except (OSError, json.JSONDecodeError):
    print(timestamp(), "Problem reading config file:", configpath)
    print(timestamp(), "ERROR: Config file not found or invalid!")
    sys.exit(1)

# Feeds and assets are stored in directories next to the script
public_path = filedir + '/public'
assets_path = public_path + '/assets'
feeds_path = public_path + '/feeds'

# e.g. https://example.com/some-string
assets_url = config['assets_url']

requestheaders = config['request_headers']
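
# For reference, a minimal config.json could look roughly like this (the key
# names are the ones read above and in process_feed(); all values are made-up
# placeholders):
#
#   {
#       "assets_url": "https://example.com/assets",
#       "request_headers": {"User-Agent": "Mozilla/5.0 (compatible; personal feed mirror)"},
#       "feeds": [
#           {
#               "source": "https://example.com/rss",
#               "destination": "example.xml",
#               "exclude": ["sponsored"]
#           }
#       ]
#   }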

# We need filename-safe strings for storing images alongside the HTML files
def get_valid_filename(s):
    s = str(s).split('?')[0].strip()
    s = re.sub(r'^https?://', '', s).strip('/').replace(' ', '-')
    return re.sub(r'(?u)[^-\w.]', '-', s)
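# For example (hypothetical call; the URL is the sample one mentioned in
# process_feed() below):
#   get_valid_filename('https://orf.at/stories/3117136/')  ->  'orf.at-stories-3117136'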

# Get a unique and valid filename from a URL (for images)
def filename_from_url(url):
    # drop GET parameters and path
    new_filename = url.split('?')[0].split('/')[-1]
    # split the filename at the dots
    new_filename = new_filename.split('.')
    # insert a hash of the full URL before the suffix
    new_filename.insert(1, hashlib.md5(url.encode('utf-8')).hexdigest())
    # join back into a string and validate once more
    new_filename = get_valid_filename('.'.join(new_filename))
    return new_filename
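# For example (hypothetical URL; <md5-of-url> stands for the hex digest of the
# full URL including its query string):
#   filename_from_url('https://example.com/images/pic.jpg?width=800')  ->  'pic.<md5-of-url>.jpg'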

# Download an image (or other asset) into the entry's assets directory
def download_image(url, entry_dir, filename):
    # take care of protocol-relative URLs ... let's just assume that https works
    if url.startswith('//'):
        url = 'https:' + url
    response = requests.get(url, headers=requestheaders)
    if response.status_code == 200:
        with open(assets_path + '/' + entry_dir + '/' + filename, 'wb') as f:
            for chunk in response.iter_content(1024):
                f.write(chunk)

# Process one feed, as specified in the config
def process_feed(obj):
    feed_url = obj['source']
    output_filename = obj['destination']

    print(timestamp(), 'Updating:', obj['destination'])

    # Step 1: Get the feed
    r_feed = requests.get(feed_url, headers=requestheaders)
    # TODO: exceptions (what if we get a 404 or whatever?)
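
    # A minimal guard for the TODO above (an assumption about the desired
    # behaviour: on a non-200 response, skip this feed instead of raising):
    if r_feed.status_code != 200:
        print(timestamp(), 'ERROR: Could not fetch feed:', feed_url, '->', r_feed.status_code)
        return
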
    # Step 2: Scrape and store data of new articles
    for entry in feedparser.parse(r_feed.text).entries:
        entry_dir = get_valid_filename(entry.link)  # input e.g. https://orf.at/stories/3117136/
        entry_path = assets_path + '/' + entry_dir
        if not os.path.exists(entry_path):
            print(timestamp(), 'New item:', entry.link)
            r = requests.get(entry.link.split('?')[0], headers=requestheaders)

            online_soup = BeautifulSoup(r.text, 'html.parser')

            content_soup = BeautifulSoup('<div></div>', 'html.parser')

            # Remove all Comments
            for element in online_soup(string=lambda text: isinstance(text, Comment)):
                element.extract()

            # domain and path specific rules
            # ... ob+fu+sca+tion for seo

            if entry.link.startswith('https://or'+'f.a'+'t/sto'+'ries'):
                if entry.get('date'):
                    article_time = content_soup.new_tag('time', datetime=entry.date)
                    content_soup.div.append(article_time)
                article_headline = online_soup.find('h1', attrs={'class': 'story-lead-headline'})
                content_soup.div.append(article_headline)
                article_body = online_soup.find('div', attrs={'class': 'story-content'})
                content_soup.div.append(article_body)

                # Add a link to original article
                article_source = content_soup.new_tag('a', href=entry.link)
                article_source['class'] = 'source'
                article_source.string = 'Quelle: ' + entry.link
                content_soup.div.append(article_source)

            if entry.link.startswith('https://de'+'rst'+'and'+'ard'+'.a'+'t/20'):  # url starts with number ... too lazy for regex :)
                if entry.get('published'):
                    article_time = content_soup.new_tag('time', datetime=entry.published)
                    content_soup.div.append(article_time)
                article_headline = online_soup.find('h1', attrs={'itemprop': 'headline'})
                content_soup.div.append(article_headline)
                # images etc.
                article_aside = online_soup.find('div', id="content-aside")
                content_soup.div.append(article_aside)
                article_body = online_soup.find('div', attrs={'itemprop': 'articleBody'})
                content_soup.div.append(article_body)

                # Add a link to comments
                # modify original link -> mobile version and comment section
                link_to_comments = re.sub(r'(\/\/)', r'\1mobil.', entry.link.split('?')[0]) + '?_viewMode=forum#'
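                # e.g. https://<domain>/2000...  becomes
                #      https://mobil.<domain>/2000...?_viewMode=forum#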
                article_comments_link = content_soup.new_tag('a', href=link_to_comments)
                article_comments_link['class'] = 'comments'
                article_comments_p = content_soup.new_tag('p')
                article_comments_link.string = 'Kommentare'
                article_comments_p.append(article_comments_link)
                content_soup.div.append(article_comments_p)

                # Add a link to original article
                article_source = content_soup.new_tag('a', href=entry.link.split('?')[0])
                article_source['class'] = 'source'
                article_source.string = 'Quelle: ' + entry.link.split('?')[0]
                article_source_p = content_soup.new_tag('p')
                article_source_p.append(article_source)
                content_soup.div.append(article_source_p)

            # create directory for storing and serving html and images
            os.makedirs(entry_path)

            # download all article images and replace image source
            for img in content_soup.findAll('img'):
                if img.get('data-src'):
                    old_url = img['data-src']
                    if not old_url.startswith('data:'):
                        new_filename = filename_from_url(old_url)
                        img['data-src'] = assets_url + '/' + entry_dir + '/' + new_filename
                        download_image(old_url, entry_dir, new_filename)
                if img.get('src'):
                    old_url = img['src']
                    if not old_url.startswith('data:'):
                        new_filename = filename_from_url(old_url)
                        img['src'] = assets_url + '/' + entry_dir + '/' + new_filename
                        download_image(old_url, entry_dir, new_filename)
                if img.get('data-srcset'):
                    srcset = img['data-srcset'].split(', ')
                    new_srcset = []
                    for src in srcset:
                        parts = src.split(' ')
                        old_url = parts[0]
                        # keep the width/density descriptor if there is one
                        src_res = parts[1] if len(parts) > 1 else ''
                        new_filename = filename_from_url(old_url)
                        download_image(old_url, entry_dir, new_filename)
                        new_url = assets_url + '/' + entry_dir + '/' + new_filename
                        new_srcset.append(' '.join([new_url, src_res]).strip())
                    img['data-srcset'] = ', '.join(new_srcset)
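                # Note (for context): a typical srcset value looks like
                # "https://example.com/img-320.jpg 320w, https://example.com/img-640.jpg 640w",
                # i.e. a URL plus an optional descriptor per comma-separated entry,
                # which is why each entry is split on the space above.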

            # TODO(?): handle the HTML5 <picture> tag as well

            # Save HTML
            with open(entry_path + '/index.html', 'w') as f:
                f.write(str(content_soup))

            # Wait a bit between requests
            time.sleep(1.3)

    # Step 3: Create the 'new' feed

    # Maybe building a new feed from scratch using a template would be nicer, but ...
    # let's just modify the original one!

    feed_soup = BeautifulSoup(r_feed.text, 'lxml-xml')  # the 'lxml-xml' parser requires lxml

    # Exclude items
    if obj.get('exclude') and isinstance(obj['exclude'], list):
        for e in feed_soup.findAll('item'):
            matches = [x for x in obj['exclude'] if x.lower() in e.title.text.lower()]
            if len(matches) > 0:
                e.extract()
                print(timestamp(), 'Exclude:', e.title.text, '->', matches)
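    # For example, an 'exclude' list like ["sponsored", "advertorial"] (made-up
    # values) drops every item whose title contains one of those strings,
    # compared case-insensitively.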

    # Add the scraped content to each remaining item
    for e in feed_soup.findAll('item'):
        entry_dir = get_valid_filename(e.link.text)
        with open(assets_path + '/' + entry_dir + '/index.html', 'r') as f_content:
            content_tag = feed_soup.new_tag('content:encoded')
            content_tag.string = CData(f_content.read())
            e.append(content_tag)

    # create the feeds directory if not present
    os.makedirs(feeds_path, exist_ok=True)

    with open(feeds_path + '/' + output_filename, 'w') as f:
        f.write(str(feed_soup))
    print(timestamp(), 'Done!')


# Let's actually fetch the stuff!

for feed in config['feeds']:
    process_feed(feed)