
add string exclusion

master
Andreas Demmelbauer, 5 years ago
commit 309f78e1cb
2 changed files with 17 additions and 5 deletions:
  1. config.example.json (+2 -1)
  2. feedcake.py (+15 -4)

config.example.json (+2 -1)

@@ -3,7 +3,8 @@
   "feeds" : [
     {
       "source" : "https://a.newspaper.tld/news.xml",
-      "destination": "newspaper.xml"
+      "destination": "newspaper.xml",
+      "exclude": ["10 most", "hate", "?!", "click", "horrific", "will shock you"]
     },
     {
       "source" : "https://another.newspaper.tld/rss",


feedcake.py (+15 -4)

@@ -28,7 +28,6 @@ except:
     print("ERROR: Config file not found or invalid!")
     sys.exit(1)
 
-print(filedir)
 public_path = filedir + '/public'
 assets_path = public_path + '/assets'
 feeds_path = public_path + '/feeds'
@@ -79,7 +78,11 @@ def download_image(url, entry_dir, filename):
         for chunk in response.iter_content(1024):
             f.write(chunk)
 
-def process_feed(feed_url, output_filename):
+def process_feed(obj):
+    feed_url = obj['source']
+    output_filename = obj['destination']
+
+    print('Updating:', obj['destination'])
 
     # Get the feed
     r_feed = requests.get(feed_url, headers=requestheaders)
@@ -88,10 +91,10 @@ def process_feed(feed_url, output_filename):
 
     # Store data of new articles
     for entry in feedparser.parse(r_feed.text).entries:
-        print(entry.link)
         entry_dir = get_valid_filename(entry.link) # input e.g. https://orf.at/stories/3117136/
         entry_path = assets_path + '/'+ entry_dir
         if not os.path.exists(entry_path):
+            print('New item: ', entry.link)
             r = requests.get(entry.link.split('?')[0], headers=requestheaders)
 
             online_soup = BeautifulSoup(r.text, 'html.parser')
@@ -192,6 +195,14 @@ def process_feed(feed_url, output_filename):
 
     feed_soup = BeautifulSoup(r_feed.text, 'lxml-xml')
 
+    # Exclude items
+    if obj.get('exclude') and isinstance(obj['exclude'], list):
+        for e in feed_soup.findAll('item'):
+            matches = [x for x in obj['exclude'] if x.lower() in e.title.text.lower()]
+            if len(matches) > 0:
+                e.extract()
+                print('Exclude: ', e.title.text, '->', matches)
+
     for e in feed_soup.findAll('item'):
         entry_dir = get_valid_filename(e.link.text)
         f_content = open(assets_path + '/' + entry_dir + '/index.html', 'r')
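The new exclusion pass runs on the parsed feed XML before the entries are rewritten: every configured string is matched case-insensitively as a substring of the item title, and a single match is enough to remove the item via extract(). A standalone sketch of that matching rule, using a made-up title:

exclude = ["10 most", "hate", "?!", "click", "horrific", "will shock you"]
title = "These 10 MOST horrific headlines will shock you"

# Same comprehension as in the hunk above, just without BeautifulSoup
matches = [x for x in exclude if x.lower() in title.lower()]
print(matches)  # ['10 most', 'horrific', 'will shock you'] -> item gets dropped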
@@ -213,4 +224,4 @@ def process_feed(feed_url, output_filename):
 # Let's actually fetch the stuff!
 
 for feed in config['feeds']:
-    process_feed(feed['source'], feed['destination'])
+    process_feed(feed)
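With the new signature, the whole feed object from the config travels into process_feed(), so optional keys like "exclude" are available wherever they are needed. A hypothetical call, reusing the values from config.example.json above:

feed = {
    "source": "https://a.newspaper.tld/news.xml",
    "destination": "newspaper.xml",
    "exclude": ["click", "will shock you"],  # optional; omit to keep every item
}
process_feed(feed)  # fetches the source feed, filters it, writes the result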
