Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 20 additions & 1 deletion src/scrapers/daily_sun_scrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,10 @@
from datetime import datetime, timedelta
from dotenv import load_dotenv
from ..services import ArticleService
from ..utils.constants import ARTICLE_IMG_TAG
import logging
from bs4 import BeautifulSoup
import base64

load_dotenv()

Expand Down Expand Up @@ -36,16 +39,32 @@ def fetch_news():
)
article_url = f"https://cornellsun.com/article/{article['slug']}"

article_image = None
try:
response = requests.get(
article_url,
headers={
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
}
)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
img_tag = soup.select_one(ARTICLE_IMG_TAG)
if img_tag and img_tag.get('src'):
article_image=img_tag.get('src')
except Exception as e:
logging.error(f"Error fetching news: {str(e)}")
article_doc = {
"title": article["headline"],
"image": article["dominantMedia"]["title"] if article["dominantMedia"] else None,
"image": article_image,
"sports_type": sports_type,
"published_at": published_at,
"url": article_url,
"slug": article["slug"],
"created_at": datetime.now()
}
articles_to_store.append(article_doc)


if articles_to_store:
ArticleService.create_articles_bulk(articles_to_store)
Expand Down
2 changes: 1 addition & 1 deletion src/utils/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,4 +130,4 @@
# The maximum number of videos to retrieve
VIDEO_LIMIT = 20


# CSS selector for an article's image: an <img> nested inside an element
# with class "dom-art-container". It is passed to BeautifulSoup's
# select_one(), and the matched tag's "src" attribute is read as the
# article image URL.
# NOTE(review): the class name is tied to the scraped site's current page
# markup — confirm it still matches if image extraction starts returning None.
ARTICLE_IMG_TAG = ".dom-art-container img"