GMU:Bots 'n' Plots/Christopher Marx: Difference between revisions

Revision as of 20:41, 3 October 2015

Idiomaggio - a language detection bot

"The limits of my language are the limits of my world." (Ludwig Wittgenstein)

Idiomaggio is a Twitter bot that detects the language of tweets and responds automatically in the right tongue. In a multilingual world, many different languages can surround you. With the help of Idiomaggio you can figure out which language is being spoken.

The Twitter bot works with the NLTK library, mainly with its built-in stop-word lists. Linking words, conjunctions, articles and pronouns are words that make a language speakable; however, they don't create the meaning of a text. Idiomaggio picks the stop words out of a tweet and uses them to detect its language. In a second step, it replies to the tweet in the detected language.

[1]

Idiomaggio understands the European languages Swedish, Danish, Hungarian, Finnish, Portuguese, German, Dutch, French, Spanish, Norwegian, English, Russian, Turkish and Italian.

The Code

#!/usr/bin/env python2
# -*- coding: utf-8 -*- #

from twitterbot import TwitterBot
import keys

import nltk
from nltk import wordpunct_tokenize
from nltk.corpus import stopwords

class Idiomaggio(TwitterBot):
    """Twitter bot that guesses the language of a mention and replies in it.

    The language is guessed by counting how many distinct lower-cased tokens
    of the tweet appear in each NLTK stop-word list; the list with the
    largest overlap wins.
    """

    # Canned reply for each supported language, keyed by the name of the
    # NLTK stop-word corpus file the detection step returns.
    REPLIES = {
        "danish": u'Hej! Taler du dansk?',
        "dutch": u'Hi! Groeten uit Holland.',
        "english": u'Hey! I speak some English.',
        "finnish": u'Hei! Terveisiä Suomi.',
        "french": u'Salut! Parlez-vous français?',
        "german": u'Hey! Sprichst du deutsch?',
        "hungarian": u'Hello! Beszélsz magyarul?',
        "italian": u'Ciao! Saluti da Italia.',
        "norwegian": u'Hei! Jeg snakker norsk.',
        "portuguese": u'Olá! Você fala português?',
        "russian": u'Привет! Привет из России.',
        "spanish": u'¡Hola! Saludos desde España.',
        "swedish": u'Hej! Talar du svenska?',
        "turkish": u'Merhaba! Türkçe biliyor musun?',
    }

    def bot_init(self):
        """Fill in ``self.config`` with credentials and reply behaviour.

        Use your own consumer key (in the ``keys`` module) to make the bot
        alive.
        """

        ############################
        # REQUIRED: LOGIN DETAILS! #
        ############################

        self.config['api_key'] = keys.consumer_key
        self.config['api_secret'] = keys.consumer_secret
        self.config['access_key'] = keys.access_token
        self.config['access_secret'] = keys.access_token_secret

        ######################################
        # SEMI-OPTIONAL: OTHER CONFIG STUFF! #
        ######################################

        # how often to tweet, in seconds
        # (10 seconds here; the template comment said "default: 30 minutes")
        self.config['tweet_interval'] = 1 * 10

        # use this to define a (min, max) random range of how often to tweet
        # e.g., self.config['tweet_interval_range'] = (5*60, 10*60) # tweets every 5-10 minutes
        self.config['tweet_interval_range'] = None

        # only reply to tweets that specifically mention the bot
        self.config['reply_direct_mention_only'] = True

        # only include bot followers (and original tweeter) in @-replies
        self.config['reply_followers_only'] = True

        # fav any tweets that mention this bot?
        self.config['autofav_mentions'] = False

        # fav any tweets containing these keywords?
        self.config['autofav_keywords'] = []

        # follow back all followers?
        self.config['autofollow'] = False

        ###########################################
        # CUSTOM: your bot's own state variables! #
        ###########################################

    def on_scheduled_tweet(self):
        """Do nothing: this bot only reacts to mentions."""
        pass

    def on_mention(self, tweet, prefix):
        """Detect the language of ``tweet`` and reply with a canned greeting.

        ``tweet.text`` is tokenized and lower-cased, then compared against
        every NLTK stop-word list. If the best-matching language has a reply
        in ``REPLIES``, ``prefix + ' ' + reply`` is posted as a reply to
        ``tweet``. If no token matches any stop-word list, nothing is posted,
        because the "best" language would then be an arbitrary pick.
        """
        text = tweet.text

        print(text)

        print(type(text))

        # Build the tweet's vocabulary once; it does not change per language.
        words_set = {token.lower() for token in wordpunct_tokenize(text)}

        # Score = number of distinct tweet tokens that are stop words.
        percentage = {}
        for language in stopwords.fileids():
            matches = words_set.intersection(stopwords.words(language))
            percentage[language] = len(matches)

        # With no corpora, or no overlap at all, max() would either raise or
        # return an arbitrary language, so bail out instead of guessing.
        if not percentage or max(percentage.values()) == 0:
            print("No stop words recognised; not replying.")
            return

        most_probable = max(percentage, key=percentage.get)

        # Languages NLTK knows but the bot has no greeting for get no reply.
        response = self.REPLIES.get(most_probable)
        if response is not None:
            prefixed = prefix + ' ' + response
            self.post_tweet(prefixed, reply_to=tweet)

        print("The language of your text is %s" % most_probable)

    def on_timeline(self, tweet, prefix):
        """Do nothing: timeline tweets are ignored."""
        pass

if __name__ == '__main__':
    # Script entry point: build the bot and hand control to its run loop.
    Idiomaggio().run()

100px100px

Unfortunately, I got stuck with NLTK's cmudict.dict() function while trying out this idea: http://h6o6.com/2013/03/using-python-and-the-nltk-to-find-haikus-in-the-public-twitter-stream/

So the result is not that complex… :(

grab the code at pastebin: http://pastebin.com/GvgyfDUU