added soup.io, added dry run, changed cli, added config file

7 years ago · 35a5e3b7f8
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,3 @@
 *.secret
 cron.sh

 config.json
--- a/README.md
+++ b/README.md
@@ -1,21 +1,23 @@
 # TootBot
 # TootBot Spicy Edition

 A small python 3.x script to replicate tweets on a mastodon account.
 A small python 3.x script to replicate tweets and posts from soup.io on a mastodon account.

 The script only needs a mastodon login/password to post toots.

 It gets the tweets from RSS available at http://twitrss.me, then does some cleanup on the content:
 For Twitter, it gets the tweets from RSS available at http://twitrss.me, then does some cleanup on the content:
 - twitter tracking links (t.co) are dereferenced
 - twitter hosted pictures are retrieved and uploaded to mastodon

 A sqlite database is used to keep track of tweets that have been tooted.
 For Soup, it uses the official RSS feed (e.g. https://metalab.soup.io/rss), then does some cleanup on the content:
 - checks if source is twitter and checks for duplicates
 - removes html stuff
 - looks for one picture in source link or in soup post and uploads it to mastodon
 - adds source url (if no source is specified, soup post is source url)


 This script is in use for a few accounts:
 - cq94 -> https://amicale.net/@cquest
 - Etalab -> https://mastodon.etalab.gouv.fr/@etalab
 - datagouvfr -> https://mastodon.etalab.gouv.fr/@datagouvfr
 - osm_fr -> https://fr.osm.social/@osm_fr
 - sotmfr -> https://fr.osm.social/@sotmfr
 A sqlite database is used to keep track of items that have been tooted.


 This script is based on TootBot: https://github.com/cquest/tootbot

 The script is simply called by a cron job and can run on any server (does not have to be on the mastodon instance server).
--- a/config-example.json
+++ b/config-example.json
@@ -0,0 +1,16 @@
 {
  "mastodon": {
    "instance": "mastodon.social",
    "user": "loginemail@example.com",
    "password": "correcthorsebatterystaple"
  },
  "sources": {
    "twitter": "metalabVie",
    "soup": "metalab.soup.io"
  },
  "settings": {
    "days": 1,
    "dryrun": true,
    "databasefilepath": "tootbot.db"
  }
 }
--- a/cron-example.sh
+++ b/cron-example.sh
@@ -0,0 +1,5 @@
 # activate virtualenv if necessary
 # source /home/cquest/.virtualenvs/tootbot/bin/activate

 python tootbot.py config.json
 python tootbot.py anotherconfig.json
--- a/cron-sample.sh
+++ b/cron-sample.sh
@@ -1,11 +0,0 @@
 # activate virtualenv if necessary
 # source /home/cquest/.virtualenvs/tootbot/bin/activate

 # parameters:
 # 1- twitter account to clone
 # 2- mastodon login
 # 3- mastodon password
 # 4- instance domain (https:// is automatically added)

 python tootbot.py geonym_fr geonym@amicale.net **password** test.amicale.net
 python tootbot.py cq94 cquest@amicale.net **password** test.amicale.net
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
 feedparser
 Mastodon.py
 requests

 html2text
--- a/tootbot.py
+++ b/tootbot.py
@@ -6,80 +6,125 @@ import json
 import requests
 import re
 import sqlite3
 import html2text
 from datetime import datetime, date, time, timedelta

 if len(sys.argv) < 4:
    print("Usage: python3 tootbot.py twitter_account mastodon_login mastodon_passwd mastodon_instance")


 # Load the JSON configuration.
 # Default config location is a 'config.json' next to the script; a different
 # file can be given as the first command line argument.
 filedir = os.path.dirname(os.path.abspath(__file__))
 if len(sys.argv) < 2:
    print("Using default config location:  %s/config.json" % filedir)
    configpath = os.path.join(filedir, 'config.json')
 else:
    configpath = sys.argv[1]

 try:
    # the context manager closes the file handle even when parsing fails
    with open(configpath) as configfile:
        config = json.load(configfile)
 except OSError:
    # missing or unreadable file
    print("ERROR: Config file not found!")
    sys.exit(1)
 except ValueError as err:
    # json.JSONDecodeError is a ValueError subclass: the file exists but is
    # malformed, so do not report it as "not found"
    print("ERROR: Config file is not valid JSON: %s" % err)
    sys.exit(1)


 # target mastodon account: instance domain plus the login used for posting
 mastinstance = config['mastodon']['instance']
 mastuser = config['mastodon']['user']
 mastpasswd = config['mastodon']['password']

 # feed sources: twitter account name and soup.io host name
 twitter = config['sources']['twitter']
 soup = config['sources']['soup']

 # dryrun: when true nothing is uploaded, posted or stored
 # days: maximum age of a feed item that is still processed
 dryrun = config['settings']['dryrun']
 days = config['settings']['days']


 # sqlite db to store processed items (and corresponding toot ids)
 sql = sqlite3.connect('tootbot.db')
 sql = sqlite3.connect(config['settings']['databasefilepath'])
 db = sql.cursor()
 db.execute('''CREATE TABLE IF NOT EXISTS tweets (tweet text, toot text, twitter text, mastodon text, instance text)''')
 # posts: one row per source item that has been tooted
 db.execute('''CREATE TABLE IF NOT EXISTS posts (srcpost text, srcuser text, mastpost text, mastuser text, mastinstance text)''')

 mastodon_api = None

 def register_app(mastuser,mastpasswd,mastinstance,mastodon_api):
    """Return a logged-in Mastodon client, registering the app first if needed.

    mastuser     -- login of the mastodon account
    mastpasswd   -- password of the mastodon account
    mastinstance -- instance domain ('https://' is added automatically)
    mastodon_api -- an already logged-in client, or None

    When mastodon_api is not None it is returned unchanged, so callers can
    invoke this once per feed entry without logging in again each time.
    Exits the process with status 1 if the app cannot be created or the
    login fails.
    """
    # reuse the existing session instead of logging in on every call
    if mastodon_api is not None:
        return mastodon_api

    # Create application if it does not exist
    if not os.path.isfile(mastinstance+'.secret'):
        if Mastodon.create_app(
            'metasyndicator',
            api_base_url='https://'+mastinstance,
            to_file = mastinstance+'.secret'
        ):
            print('app created on instance '+mastinstance)
        else:
            print('failed to create app on instance '+mastinstance)
            sys.exit(1)

    try:
        mastodon_api = Mastodon(
          client_id=mastinstance+'.secret',
          api_base_url='https://'+mastinstance
        )
        mastodon_api.log_in(
            username=mastuser,
            password=mastpasswd,
            scopes=['read', 'write'],
            to_file=mastuser+".secret"
        )
    except Exception as err:
        # show the underlying reason; a bare except would also trap Ctrl-C
        print("ERROR: First Login Failed!")
        print(err)
        sys.exit(1)
    return mastodon_api

 if len(sys.argv)>4:
    instance = sys.argv[4]
 else:
    instance = 'amicale.net'

 if len(sys.argv)>5:
    days = int(sys.argv[5])
 else:
    days = 1

 twitter = sys.argv[1]
 mastodon = sys.argv[2]
 passwd = sys.argv[3]

 mastodon_api = None

 d = feedparser.parse('http://twitrss.me/twitter_user_to_rss/?user='+twitter)
 # twitter section

 for t in reversed(d.entries):
 print('====== TWITTER ======')

 t = feedparser.parse('http://twitrss.me/twitter_user_to_rss/?user='+twitter)

 for p in reversed(t.entries):
    # check if this tweet has been processed
    db.execute('SELECT * FROM tweets WHERE tweet = ? AND twitter = ?  and mastodon = ? and instance = ?',(t.id, twitter, mastodon, instance))
    db.execute(
        'SELECT * FROM posts WHERE srcpost = ? AND srcuser = ?  AND mastuser = ? AND mastinstance = ?',
        (p.id, twitter, mastuser, mastinstance)
    )
    last = db.fetchone()

    # process only unprocessed tweets less than 1 day old
    if last is None and (datetime.now()-datetime(t.published_parsed.tm_year, t.published_parsed.tm_mon, t.published_parsed.tm_mday, t.published_parsed.tm_hour, t.published_parsed.tm_min, t.published_parsed.tm_sec) < timedelta(days=days)):
        if mastodon_api is None:
            # Create application if it does not exist
            if not os.path.isfile(instance+'.secret'):
                if Mastodon.create_app(
                    'tootbot',
                    api_base_url='https://'+instance,
                    to_file = instance+'.secret'
                ):
                    print('tootbot app created on instance '+instance)
                else:
                    print('failed to create app on instance '+instance)
                    sys.exit(1)
    shouldpost = True
    posttime = datetime(p.published_parsed.tm_year, p.published_parsed.tm_mon, p.published_parsed.tm_mday, p.published_parsed.tm_hour, p.published_parsed.tm_min, p.published_parsed.tm_sec)
    if last is not None:
        shouldpost = False
        print("skip: already posted")
    # process only unprocessed tweets less than n days old
    if datetime.now() - posttime > timedelta(days=days):
        shouldpost = False
        print("skip: Post too old")
    # kill tweets with fb links with fire!
    if "https://www.facebook.com" in p.title or "https://m.facebook.com" in p.title:
        shouldpost = False
        print("skip: a Tweet that links to facebook? ... That's too much.")


            try:
                mastodon_api = Mastodon(
                  client_id=instance+'.secret',
                  api_base_url='https://'+instance
                )
                mastodon_api.log_in(
                    username=mastodon,
                    password=passwd,
                    scopes=['read', 'write'],
                    to_file=mastodon+".secret"
                )
            except:
                print("ERROR: First Login Failed!")
                sys.exit(1)

        #h = BeautifulSoup(t.summary_detail.value, "html.parser")
        c = t.title
        if t.author != '(%s)' % twitter:
            c = ("RT %s\n" % t.author[1:-1]) + c
    if shouldpost:
        print(posttime)
        # Create application if it does not exist
        mastodon_api = register_app(mastuser, mastpasswd, mastinstance, mastodon_api)

        c = p.title

        if p.author != '(%s)' % twitter:
            c = ("RT %s\n" % p.author[1:-1]) + c
        toot_media = []
        # get the pictures...
        for p in re.finditer(r"https://pbs.twimg.com/[^ \xa0\"]*", t.summary):
            media = requests.get(p.group(0))
            media_posted = mastodon_api.media_post(media.content, mime_type=media.headers.get('content-type'))
            toot_media.append(media_posted['id'])
        for pic in re.finditer(r"https://pbs.twimg.com/[^ \xa0\"]*", p.summary):
            if (not dryrun):
                media = requests.get(pic.group(0))
                media_posted = mastodon_api.media_post(media.content, mime_type=media.headers.get('content-type'))
                toot_media.append(media_posted['id'])
                media = None
            else:
                print('Dryrun: not fetching ', pic.group(0), ' and not uploading it to mastodon')

        # replace t.co link by original URL
        m = re.search(r"http[^ \xa0]*", c)
@@ -97,10 +142,130 @@ for t in reversed(d.entries):

        # remove ellipsis
        c = c.replace('\xa0…',' ')
        print(c)

        if toot_media is not None:
            toot = mastodon_api.status_post(c, in_reply_to_id=None, media_ids=toot_media, sensitive=False, visibility='public', spoiler_text=None)
            if "id" in toot:
                db.execute("INSERT INTO tweets VALUES ( ? , ? , ? , ? , ? )",
                (t.id, toot["id"], twitter, mastodon, instance))
        if (not dryrun):
            toot = mastodon_api.status_post(c, in_reply_to_id=None, media_ids=toot_media, sensitive=False, visibility='unlisted', spoiler_text=None)
            print( '--> toot posted!')
            try:
                db.execute("INSERT INTO posts VALUES ( ? , ? , ? , ? , ? )", (p.id, twitter, toot.id, mastuser, mastinstance))
                sql.commit()
            except:
                print('database execution failed.')
                print('p.id: ', p.id)
                print('toot.id: ', toot.id)

        else:
            print('Dryrun: not posting toot and not adding it to database')
        print('------------------------')



 # soup.io section
 #
 # Mirrors the entries of the soup.io RSS feed to mastodon: skips entries that
 # were already posted, are too old, or whose source tweet was already tooted
 # from the twitter feed; uploads at most one picture; posts the cleaned-up
 # text followed by a source link.

 print('====== SOUP ======')

 # converter used to turn the HTML summary of a post into text
 h = html2text.HTML2Text()
 h.ignore_links = True
 h.ignore_images = True
 h.body_width = 0

 s = feedparser.parse('http://'+soup+'/rss')

 # status ids of all tweets that were already tooted from the twitter feed;
 # fetched once up front because the loop below only inserts rows for soup
 db.execute('SELECT srcpost FROM posts where srcuser = ?', (twitter,))
 postedtweets = {row[0].split('/')[-1] for row in db.fetchall()}

 accepted_filetypes = ('.jpg', '.jpeg', '.png', '.webm', '.JPG', '.JPEG', '.PNG', '.WEBM') # let's don't do mp4 for now.

 for p in reversed(s.entries):
    # check if this post has been processed
    db.execute(
        'SELECT * FROM posts WHERE srcpost = ? AND srcuser = ?  AND mastuser = ? AND mastinstance = ?',
        (p.id, soup, mastuser, mastinstance)
    )
    last = db.fetchone()

    # process only unprocessed posts less than n days old
    # NOTE(review): feedparser documents its *_parsed dates as UTC while
    # datetime.now() is local time, so the window is skewed by the UTC
    # offset - confirm whether that matters here.
    posttime = datetime(*p.published_parsed[:6])
    if last is not None or datetime.now() - posttime >= timedelta(days=days):
        continue

    # Create application if it does not exist
    mastodon_api = register_app(mastuser, mastpasswd, mastinstance, mastodon_api)

    print(p.link)
    j = json.loads(p.soup_attributes)

    # 'source' may be missing or null; only a string is usable as a URL
    source_url = j.get('source')
    if not isinstance(source_url, str):
        source_url = None

    # get status id if twitter is source
    twitterstatus = None
    if source_url is not None and source_url.startswith(('https://twitter.com/', 'https://mobile.twitter.com/')):
        twitterurl = source_url.split('/')
        # links without a '/status/<id>' part (e.g. profile links) are too
        # short to index blindly
        if len(twitterurl) > 5 and twitterurl[4] == 'status':
            twitterstatus = twitterurl[5]

    # check if already tweeted
    if twitterstatus in postedtweets:
        print('Already tweeted: ', source_url)

    else:
        # collect information about images: prefer the source link, fall
        # back to the url attribute of the soup post
        pics = []
        post_url = j.get('url')
        if source_url is not None and source_url.endswith(accepted_filetypes):
            pics.append(source_url)
        elif isinstance(post_url, str) and post_url.endswith(accepted_filetypes):
            pics.append(post_url)

        # get the images and post them to mastodon ...
        toot_media = []
        for pic in pics:
            if (not dryrun):
                media = requests.get(pic)
                print(pic, ' has mimetype ', media.headers.get('content-type'))
                media_posted = mastodon_api.media_post(media.content, mime_type=media.headers.get('content-type'))
                toot_media.append(media_posted['id'])
            else:
                print('Dryrun: not fetching ', pic, ' and not uploading it to mastodon')

        # remove all html stuff - python module in use only supports markdown, not pure plaintext
        textsrc = h.handle(p.summary_detail.value.replace("<small>", "<br><small>"))
        # strip surrounding whitespace from every line
        cleantextsrc = ''.join(line.strip() + '\n' for line in textsrc.split('\n'))

        # strip newlines, reduce newlines, remove markdown bold (i know, ugly), do some clean up
        text = cleantextsrc.strip('\n').replace('\n\n\n','\n\n').replace('**','').replace('\\--','')

        # link directly to source or use soup as source.
        if source_url is not None and source_url not in text:
            source = '\n\nSource: ' + source_url
        else:
            source = '\n\nSource: ' + p.link

        # shorten text if too long
        # 50 ... just in case (if they also count attachment url and so on);
        # never negative, otherwise the slice would count from the end
        maximumlength = max(500 - 1 - len(source) - 50, 0)
        if len(text) > maximumlength:
            text = text[:maximumlength] + '…'

        # add source
        text += source

        print(text)

        if (not dryrun):
            # post toot
            toot = mastodon_api.status_post(text, in_reply_to_id=None, media_ids=toot_media, sensitive=False, visibility='public', spoiler_text=None)

            # add entry to database
            if "id" in toot:
                db.execute("INSERT INTO posts VALUES ( ? , ? , ? , ? , ? )", (p.id, soup, toot.id, mastuser, mastinstance))
                sql.commit()
                print( '--> ',  p.id, ' posted!')
        else:
            print('Dryrun: not posting toot and not adding it to database')

    print('------------------------')