Monday, November 25, 2024

Scrape X.Com Part 1

I am interested in writing a Python script to web-scrape X (formerly Twitter) and pull videos and GIFs from my favorite posts. X does offer an API that allows a Python script to do what I want. I even got Microsoft Copilot to generate a working script, but using the API is really expensive. Therefore, I found an article that shows how to work around the API. See the following tutorial from Scrapfly - How to Scrape (Twitter) using Python (2024 Update). I also hope to use AI to put the downloaded pictures or videos in folders for archiving. I wanted to do both in one script, but I may have to write a separate script for training a model and organizing files into appropriate folders. Below is the code that Microsoft Copilot generated, with a little editing from me.

# Prompt
# Write a python script that will download videos and pictures from the bookmarked posts in X and classify them
# according to Python, Books, Comics, and Porn for a given date, creating a folder for each category and putting
# the items in the same matching category and unfavorite the post if it has post with something to download

import os
import subprocess
import sys
from datetime import datetime

# Third-party dependencies: try the import first and only fall back to
# installing the package when it is missing. The install goes through the
# running interpreter's own pip so the package lands in the environment
# that is actually executing this script.
try:
    import tweepy
except ImportError:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "tweepy"])
    import tweepy

try:
    import requests
except ImportError:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "requests"])
    import requests

# Twitter API credentials.
# Prefer values from the environment so real secrets never have to be saved
# in this file; the masked placeholders below are only a fallback and must be
# replaced (or the variables set) before the script can authenticate.
consumer_key = os.environ.get('X_CONSUMER_KEY', '***********')
consumer_secret = os.environ.get('X_CONSUMER_SECRET', '&&&&&&&&&&&&&&')
access_token = os.environ.get('X_ACCESS_TOKEN', '##########################')
access_token_secret = os.environ.get('X_ACCESS_TOKEN_SECRET', '@@@@@@@@@@@@@@@@@@@@@')

# Authenticate to Twitter
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)

# Directory to save media. Created up front; exist_ok avoids both the
# separate existence check and a failure when the folder is already there.
base_dir = 'downloaded_media'
os.makedirs(base_dir, exist_ok=True)

# Function to download media
def download_media(url, filename):
    """Fetch ``url`` and save the response body to ``filename``.

    The file is written only when the server answers with HTTP 200; any
    other status code leaves the filesystem untouched. Returns ``None``.

    Args:
        url: Address of the media item to download.
        filename: Destination path; its parent directory must already exist.
    """
    # A timeout keeps one stalled connection from hanging the whole run.
    response = requests.get(url, timeout=30)
    if response.status_code == 200:
        with open(filename, 'wb') as file:
            file.write(response.content)

# Function to classify media
def classify_media(tweet_text):
    """Return the archive category for a post based on keywords in its text.

    Matching is case-insensitive and by substring. Keywords are checked in a
    fixed order (python, book, comic, porn) and the first hit wins. Text with
    no keyword -- including empty or ``None`` text -- is 'Uncategorized'.

    Args:
        tweet_text: The text of the post.

    Returns:
        One of 'Python', 'Books', 'Comics', 'Porn' or 'Uncategorized'.
    """
    # Lower-case once instead of once per comparison; tolerate missing text.
    text = (tweet_text or '').lower()
    keyword_categories = (
        ('python', 'Python'),
        ('book', 'Books'),
        ('comic', 'Comics'),
        ('porn', 'Porn'),
    )
    for keyword, category in keyword_categories:
        if keyword in text:
            return category
    return 'Uncategorized'

# Get bookmarked tweets
bookmarked_tweets = api.get_favorites()  # This gets the liked tweets, adjust as needed for bookmarks

for tweet in bookmarked_tweets:
    # Media is filed under <base_dir>/<category>/<YYYY-MM-DD of the post>.
    tweet_date = tweet.created_at.strftime('%Y-%m-%d')
    tweet_text = tweet.text
    # Fall back explicitly so a missing category can never reach os.path.join.
    category = classify_media(tweet_text) or 'Uncategorized'
    category_dir = os.path.join(base_dir, category, tweet_date)
    os.makedirs(category_dir, exist_ok=True)
    media = tweet.entities.get('media', [])
    if media:
        for item in media:
            media_url = item['media_url']
            media_type = item['type']
            if media_type in ['photo', 'video']:
                filename = os.path.join(category_dir, os.path.basename(media_url))
                download_media(media_url, filename)
                print(f'Downloaded {media_type} from {media_url} to {category}/{tweet_date}')
        # Unfavorite the tweet
        api.destroy_favorite(tweet.id)
        print(f'Unfavorited tweet {tweet.id}')

print('Download complete.')

No comments:

Post a Comment