From 2b485e704e188e2fdda1ed84ac8d926f46b8fd03 Mon Sep 17 00:00:00 2001
From: Andreas Demmelbauer
Date: Thu, 4 Apr 2019 10:11:37 -0700
Subject: [PATCH] more documentation and comments

---
 README.md   |  5 +++--
 feedcake.py | 17 ++++++++++-------
 2 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index 34ed0ae..9b904bf 100644
--- a/README.md
+++ b/README.md
@@ -55,9 +55,10 @@ news pages
 * add cronjob for `cron.sh`: `crontab -e`
   * `*/5 * * * * /absolute/path/to/cron.sh >> /path/to/logfile 2>&1`
 * setup your webserver:
-  * let your webserver somehow point to the `feeds` directory.
+  * let your webserver somehow point to the `public/feeds` directory.
     You should protect the http path with a basic authentication.
-  * let the `assets_url` specified in the config point to the `assets` directory.
+  * let the `assets_url` you specified in the config earlier point to the
+    `public/assets` directory.
 * After running the script the first time, your desired feed is available at
   `base_url/destination` (e.g. `https://yourdomain.tld/some-url/newspaper.xml`)

diff --git a/feedcake.py b/feedcake.py
index 5eadd5f..03743e9 100644
--- a/feedcake.py
+++ b/feedcake.py
@@ -32,6 +32,7 @@ except:
     print(timestamp(), "ERROR: Config file not found or invalid!")
     sys.exit(1)

+# File location of feeds and assets are in directories next to the script
 public_path = filedir + '/public'
 assets_path = public_path + '/assets'
 feeds_path = public_path + '/feeds'
@@ -70,18 +71,19 @@ def download_image(url, entry_dir, filename):
         for chunk in response.iter_content(1024):
             f.write(chunk)


+# process the feed entries, specified in the config
 def process_feed(obj):
     feed_url = obj['source']
     output_filename = obj['destination']

     print(timestamp(), 'Updating:', obj['destination'])

-    # Get the feed
+    # Step 1: Get the feed
     r_feed = requests.get(feed_url, headers=requestheaders)
-    # TODO: exceptions.(what if 404 or whatever?)

-    # Store data of new articles
+
+    # Step 2: Scrape and store data of new articles
     for entry in feedparser.parse(r_feed.text).entries:
         entry_dir = get_valid_filename(entry.link) # input e.g. https://orf.at/stories/3117136/
         entry_path = assets_path + '/'+ entry_dir
@@ -108,7 +110,7 @@ def process_feed(obj):
             content_soup.div.append(article_headline)
             article_body = online_soup.find('div', attrs={'class': 'story-content'})
             content_soup.div.append(article_body)
-
+            # Add a link to original article
             article_source = content_soup.new_tag('a', href=entry.link)
             article_source['class'] = 'source'

@@ -145,7 +147,6 @@ def process_feed(obj):
             article_source_p.append(article_source)
             content_soup.div.append(article_source_p)

-
             # create directory for storing and serving html and images
             os.makedirs(entry_path)

@@ -178,15 +179,16 @@ def process_feed(obj):

             # TODO(?): HTML5 picture tag

-
+            # Save HTML
             f = open(entry_path + '/index.html', 'w')
             f.write(str(content_soup))
             f.close()

+            # Wait a bit
             time.sleep(1.3)

-    # Create new feed
+    # Step 3: Create 'new' feed

     # Maybe buiding a new feed from scretch using a template would be nicer but ...
     # let's just modify the original one!

@@ -201,6 +203,7 @@ def process_feed(obj):
             e.extract()
             print(timestamp(), 'Exclude: ', e.title.text, '->', matches)

+    # Add content
     for e in feed_soup.findAll('item'):
         entry_dir = get_valid_filename(e.link.text)
         f_content = open(assets_path + '/' + entry_dir + '/index.html', 'r')
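
The `download_image` context visible in the second feedcake.py hunk writes the response body in 1024-byte chunks. A self-contained sketch of that pattern for reference; the signature and loop match the diff, while `stream=True`, the `os.path.join` handling, and the module-level `assets_path` are assumptions about the surrounding script, not part of this patch:

```python
import os

import requests

assets_path = 'public/assets'  # mirrors the path setup earlier in feedcake.py

def download_image(url, entry_dir, filename):
    # Stream the response so large images are never held in memory at once
    response = requests.get(url, stream=True)
    filepath = os.path.join(assets_path, entry_dir, filename)
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    with open(filepath, 'wb') as f:
        for chunk in response.iter_content(1024):
            f.write(chunk)
```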
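The `# TODO: exceptions.(what if 404 or whatever?)` line touched in the same hunk points at error handling that is still missing: an unreachable feed URL will either raise out of the cron run or hand an error page to `feedparser`. A minimal sketch of what that guard could look like, assuming the same `requests` usage and `timestamp()` logging style seen elsewhere in feedcake.py; `fetch_feed` and the `timeout` value are illustrative, not part of the patch:

```python
from datetime import datetime

import requests

def timestamp():
    # stand-in for the helper feedcake.py already defines
    return datetime.now().strftime('%Y-%m-%d %H:%M:%S')

def fetch_feed(feed_url, requestheaders):
    try:
        r_feed = requests.get(feed_url, headers=requestheaders, timeout=30)
        r_feed.raise_for_status()  # turns 404/500 responses into exceptions
    except requests.exceptions.RequestException as e:
        print(timestamp(), 'ERROR: fetching feed failed:', e)
        return None
    return r_feed.text
```

`process_feed` could then skip the update when `fetch_feed` returns `None` instead of crashing the whole cron run.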
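`get_valid_filename(entry.link)` turns an article URL into a directory name (the inline comment gives `https://orf.at/stories/3117136/` as example input), but its implementation is not part of this diff. A plausible stand-in, modeled on Django's helper of the same name; the real feedcake version may differ:

```python
import re

def get_valid_filename(s):
    # 'https://orf.at/stories/3117136/' -> 'httpsorf.atstories3117136'
    s = str(s).strip().replace(' ', '_')
    return re.sub(r'(?u)[^-\w.]', '', s)
```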
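The "Step 3: Create 'new' feed" hunks only show fragments: extracting excluded items, then opening each entry's stored `index.html`. Read together they suggest roughly the flow below. The tag names, the exclude matching, and how the HTML ends up in the item body are assumptions, and `get_valid_filename` is the stand-in sketched above:

```python
from bs4 import BeautifulSoup  # the 'xml' parser additionally needs lxml

def rebuild_feed(feed_xml, assets_path, excludes):
    feed_soup = BeautifulSoup(feed_xml, 'xml')

    # Exclude: drop items whose title matches a configured word
    for e in feed_soup.findAll('item'):
        matches = [word for word in excludes if word.lower() in e.title.text.lower()]
        if matches:
            e.extract()

    # Add content: inject each stored article into its remaining feed item
    for e in feed_soup.findAll('item'):
        entry_dir = get_valid_filename(e.link.text)
        with open(assets_path + '/' + entry_dir + '/index.html', 'r') as f_content:
            e.description.string = f_content.read()

    return str(feed_soup)
```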