diff --git a/LabPSA_3/networking.py b/LabPSA_3/networking.py index 8b13789..1186450 100644 --- a/LabPSA_3/networking.py +++ b/LabPSA_3/networking.py @@ -1 +1,178 @@ +import json +import matplotlib.pyplot as plt +import nltk +from nltk import TweetTokenizer + +tt = TweetTokenizer() +special_chars = "1234567890#@.=?\",”$%^&’*(…[]):!><" + +likes_per_id = {} +retweets_per_id = {} +filter_words = ["sure"] +month_and_noun = {'2020-10': 0, '2020-11': 0, '2020-12': 0,'2022-01': 0, '2022-02': 0, '2022-03': 0,'2022-11': 0} +words_tweet_tokenizer = [] +words_set = [] +nouns_counted = {} +word_count_not_case_sensitive = {} +proper_nouns_counted = {} +words_per_tweet = {} +special_words = [] +word_count_dict = {} +hashtags = [] + +# Handles the tweet json, separates words into needed categories +with open("tweets.json", "r", encoding="utf-8") as tweetJson: + tweetJsonData = json.load(tweetJson) + for tweet in tweetJsonData: + tempWords = tt.tokenize(tweet["text"]) + words_per_tweet[tweet["id"]] = tempWords + for word in tempWords: + if not any(c in special_chars for c in word) and len(word) > 1: + words_tweet_tokenizer.append(word) + if word not in word_count_not_case_sensitive: + word_count_not_case_sensitive[word] = 1 + else: + word_count_not_case_sensitive[word] += 1 + if word not in word_count_dict and word.lower() not in word_count_dict: + word_count_dict[word.lower()] = 1 + else: + word_count_dict[word.lower()] += 1 + if word[0] == '#': + hashtags.append(word) + if word not in words_set: + words_set.append(word) + else: + if len(word) > 1: + special_words.append(word) + +# Outputs counted words +print("============") +print("Top 10 words") +print("============") +word_count_dict = dict(sorted(word_count_dict.items(), key=lambda item: item[1], reverse=True)) +x = 1 +for i in word_count_dict: + if x <= 10: + print(i, " ", word_count_dict[i]) + x += 1 + +# This piece of shit code counts the number of a word per month +word = input("Write a word") +with open('tweets.json', 'r', encoding='utf-8') as tweet_json: + tweet_data = json.load(tweet_json) + for i in range(len(tweet_data)): + temp_msg = nltk.word_tokenize(tweet_data[i]["text"]) + temp_date = nltk.word_tokenize(tweet_data[i]["created_at"]) + for x in temp_msg: + if x == word or x.lower() == word: + month_and_noun[temp_date[0][:7]] += 1 +x = list(month_and_noun.keys()) +y = list(month_and_noun.values()) +plt.bar(x, y, color='maroon', width=0.7) +plt.show() + +# Counts nouns +for i in word_count_dict: + ans = nltk.pos_tag([i])[0][1] + if ans == 'NN' or ans == 'NNS' or ans == 'NNPS' or ans == 'NNP': + if i not in filter_words and not any(c in ["\'"] for c in i): + nouns_counted[i] = word_count_dict[i] +nouns_counted = dict(sorted(nouns_counted.items(), key=lambda item: item[1], reverse=True)) + +# Outputs counted nouns +print("============") +print("Top 10 nouns") +print("============") +x = 1 +for i in nouns_counted: + if x <= 10: + print(i, " ", nouns_counted[i]) + x += 1 + +# Counts proper nouns +for i in word_count_not_case_sensitive: + ans = nltk.pos_tag([i])[0][1] + if ans == 'NN' or ans == 'NNS' or ans == 'NNPS' or ans == 'NNP': + if i not in filter_words and not any(c in ["\'"] for c in i) and i[0].isupper(): + proper_nouns_counted[i] = word_count_not_case_sensitive[i] +proper_nouns_counted = dict(sorted(proper_nouns_counted.items(), key=lambda item: item[1], reverse=True)) + +# Outputs proper nouns +print("===================") +print("Top 10 proper nouns") +print("===================") +x = 1 +for i in proper_nouns_counted: + if x <= 10: + print(i, " ", proper_nouns_counted[i]) + x += 1 + + +# Counts likes and retweets for each tweet +with open('tweets.json', 'r', encoding='utf-8') as tweet_json: + tweet_data = json.load(tweet_json) + for tweet in tweet_data: + likes_per_id[tweet["id"]] = tweet["likes"] + retweets_per_id[tweet["id"]] = tweet["retweets"] + +# Counts popularity by formula +popularity_nouns = {} +for noun in nouns_counted: + normLikes = 0 + normRetweets = 0 + for id in words_per_tweet: + if noun in words_per_tweet[id]: + normLikes += likes_per_id[id] + normRetweets += retweets_per_id[id] + popularity_nouns[noun] = word_count_dict[noun] * (1.4 + normRetweets) * (1.2 + normLikes) +popularity_nouns = dict(sorted(popularity_nouns.items(), key=lambda item: item[1], reverse=True)) + +# Outputs popularity nouns +print("==========================") +print("Top 10 nouns by popularity") +print("==========================") +x = 1 +for i in popularity_nouns: + if x <= 10: + print(i, " ", popularity_nouns[i]) + x += 1 + +# Suggestion stuff +word_sliced_count = {} +word = input("Write word for suggestion") +for x in word_count_dict: + if x not in word_sliced_count: + word_sliced_count[x] = 0 +for x in word_count_dict: + if word == x[:len(word)] and word != x: + word_sliced_count[x] += word_count_dict[x] +word_sliced_count = dict(sorted(word_sliced_count.items(), key=lambda item: item[1], reverse=True)) +print("===============") +print("Top suggestions") +print("===============") +x = 1 +for i in word_sliced_count: + if x <= 10: + print(i, " ", word_sliced_count[i]) + x += 1 + +# Suggestion occurrences stuff +word = input("Write word for suggestion") +words_suggestion_counted = {} +for id in words_per_tweet: + for i in range(len(words_per_tweet[id]) - 2): + if words_per_tweet[id][i] == word: + if words_per_tweet[id][i+1] not in words_suggestion_counted and len(words_per_tweet[id][i+1]) > 1: + words_suggestion_counted[words_per_tweet[id][i+1]] = 1 + elif len(words_per_tweet[id][i+1]) > 1: + words_suggestion_counted[words_per_tweet[id][i + 1]] += 1 +words_suggestion_counted = dict(sorted(words_suggestion_counted.items(), key=lambda item: item[1], reverse=True)) +print("==========================") +print("Top suggestion occurrences") +print("==========================") +x = 1 +for i in words_suggestion_counted: + if x <= 5: + print(i, " ", words_suggestion_counted[i]) + x += 1