Monday, November 25, 2024

Scrape X.Com Part 1


I am interested in writing a Python script to web-scrape X (formerly Twitter) and pull videos and GIFs from my favorite posts. X does offer an API that allows a Python script to do what I want. I even got Microsoft Copilot to generate a working script, but using the API is really expensive. Therefore I found an article that shows how to work around the API. See the following tutorial from Scrapfly - How to Scrape X.com (Twitter) using Python (2024 Update). I also hope to use AI to put the downloaded pictures or videos in folders for archiving. I wanted to do both in one script, but I may have to write a separate script for training a model and organizing files into appropriate folders. Below is the code that Microsoft Copilot generated, with a little editing from me.


# Prompt
# Write a python script that will download videos and pictures from the bookmarked posts in X and classify them
# according to Python, Books, Comics, and Porn for a given date, creating a folder for each category and putting
# the items in the same matching category and unfavorite the post if it has post with something to download
#

import os
from datetime import datetime

# Third-party dependencies. When one is missing it is installed on the fly
# and the import is retried.
import subprocess
import sys

try:
    import tweepy
except ImportError:
    # Only a missing module should trigger an install; any other error raised
    # while importing is a real problem and must not be hidden.
    # Running pip through sys.executable installs into the interpreter that is
    # executing this script, and the argument list avoids going through a shell.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "tweepy"])
    import tweepy

try:
    import requests
except ImportError:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "requests"])
    import requests

# Twitter API credentials
# Each value is taken from an environment variable when one is set, so real
# secrets do not have to be stored in this file. The masked placeholders are
# only the fallback used when the variable is absent.
consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', '***********')
consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', '&&&&&&&&&&&&&&')
access_token = os.environ.get('TWITTER_ACCESS_TOKEN', '##########################')
access_token_secret = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET', '@@@@@@@@@@@@@@@@@@@@@')

# Authenticate to Twitter
# The handler is built from the consumer key pair, then given the user's
# access token pair; `api` is the client object used by the rest of the script.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)

# Directory to save media
base_dir = 'downloaded_media'
# exist_ok=True makes this a no-op when the folder is already there and avoids
# the gap between "check that it exists" and "create it".
os.makedirs(base_dir, exist_ok=True)

# Function to download media
def download_media(url, filename, timeout=30):
    """Download ``url`` and write the response body to ``filename``.

    Args:
        url: Address of the media file to fetch.
        filename: Path of the file to write; it is opened in binary mode and
            overwritten if it already exists.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the script forever.

    Returns:
        True when the server answered with status 200 and the body was
        written, False for any other status code (nothing is written then).

    Raises:
        requests.RequestException: If the request itself fails (connection
            error, timeout, ...).
        OSError: If ``filename`` cannot be opened or written.
    """
    # stream=True keeps large videos out of memory: the body is copied to disk
    # chunk by chunk instead of being loaded in one piece.
    with requests.get(url, stream=True, timeout=timeout) as response:
        if response.status_code != 200:
            return False
        with open(filename, 'wb') as file:
            for chunk in response.iter_content(chunk_size=64 * 1024):
                file.write(chunk)
    return True

# Function to classify media
def classify_media(tweet_text):
    """Return the category name for a tweet based on keywords in its text.

    The text is lower-cased and searched for each keyword as a plain
    substring, in the order listed below; the first keyword found decides the
    category. Text containing none of the keywords is 'Uncategorized'.
    """
    lowered = tweet_text.lower()
    # Order matters: earlier entries win when several keywords are present.
    keyword_categories = (
        ('python', 'Python'),
        ('book', 'Books'),
        ('comic', 'Comics'),
        ('porn', 'Porn'),
    )
    for keyword, category in keyword_categories:
        if keyword in lowered:
            return category
    return 'Uncategorized'

from urllib.parse import urlparse

# Media entity types that are downloaded. GIFs are included because they are
# delivered as 'animated_gif' entities, not as 'photo' or 'video'.
DOWNLOADABLE_TYPES = ('photo', 'video', 'animated_gif')


def _pick_download_url(item):
    """Return the URL of the real media file for one media entity.

    For videos and animated GIFs the entity's ``media_url`` is only a preview
    image; the playable files are listed under ``video_info.variants``. The
    MP4 variant with the highest bitrate is chosen. Entities without an MP4
    variant (photos) fall back to the image URL, preferring HTTPS.
    """
    variants = item.get('video_info', {}).get('variants', [])
    mp4_variants = [v for v in variants if v.get('content_type') == 'video/mp4']
    if mp4_variants:
        return max(mp4_variants, key=lambda v: v.get('bitrate', 0))['url']
    return item.get('media_url_https') or item['media_url']


# Get bookmarked tweets
bookmarked_tweets = api.get_favorites()  # This gets the liked tweets, adjust as needed for bookmarks

for tweet in bookmarked_tweets:
    # extended_entities lists every attached item and carries video_info, but
    # it is only present on tweets that have media, so fall back to entities.
    entities = getattr(tweet, 'extended_entities', None) or tweet.entities
    media = [
        item for item in entities.get('media', [])
        if item['type'] in DOWNLOADABLE_TYPES
    ]
    if not media:
        # Nothing to download: leave the tweet favorited and create no folder.
        continue

    tweet_date = tweet.created_at.strftime('%Y-%m-%d')
    category = classify_media(tweet.text)
    category_dir = os.path.join(base_dir, category, tweet_date)
    os.makedirs(category_dir, exist_ok=True)

    all_saved = True
    for item in media:
        media_type = item['type']
        media_url = _pick_download_url(item)
        # Build the file name from the URL path only, so a query string such
        # as "?tag=12" on video URLs does not end up in the name.
        filename = os.path.join(category_dir, os.path.basename(urlparse(media_url).path))
        download_media(media_url, filename)
        # Check the file on disk rather than trusting the call blindly.
        if os.path.isfile(filename):
            print(f'Downloaded {media_type} from {media_url} to {category}/{tweet_date}')
        else:
            all_saved = False
            print(f'Failed to download {media_type} from {media_url}')

    # Unfavorite the tweet only when every item was saved; otherwise the only
    # pointer to the missing media would be lost.
    if all_saved:
        api.destroy_favorite(tweet.id)
        print(f'Unfavorited tweet {tweet.id}')

print('Download complete.')

No comments:

Post a Comment