library(here)
library(scales)
library(tidyverse)
library(readr)
library(here)
library(parameters)
# Report-wide defaults: show the code of every chunk and draw all plots with
# the black-and-white ggplot2 theme.
knitr::opts_chunk$set(echo = TRUE)
theme_set(theme_bw())
# Load the filtered Birdwatch notes and ratings and join every rating onto its
# note (notes without any rating are kept, with NA rating columns).
#
# Paths are resolved relative to the project root with here::here() instead of
# an absolute path under one user's Desktop, so the script runs on other
# machines. Assumes the project root is the `community-notes-2025-group-5`
# folder that contains `vanessa/` -- TODO confirm.
#
# NOTE(review): read_tsv() presumably parses the 19-digit noteId/tweetId values
# as doubles, which cannot represent every integer of that size exactly;
# consider reading the id columns as character -- confirm against the joins
# further down before changing types.
birdwatch_notes <- read_tsv(
  here::here("vanessa", "birdwatch-public-data-2025-06-16-notes_filtered.tsv")
)
filtered_rating <- read_tsv(
  here::here("vanessa", "complete_filtered_ratings.tsv")
)
view(birdwatch_notes)
view(filtered_rating)

join_birdwatch_ratings <- left_join(
  birdwatch_notes,
  filtered_rating,
  by = "noteId"
)
view(join_birdwatch_ratings)
Figure 2: Number of users who responded “Yes” to the question, “Did you link to sources you believe most people would consider trustworthy?”.
# Figure 2: stacked bars of note counts per classification, split by the
# trustworthySources answer.
ggplot(
  count(birdwatch_notes, classification, trustworthySources),
  aes(x = classification, y = n, fill = factor(trustworthySources))
) +
  geom_col(position = "stack") +
  scale_x_discrete(
    labels = c(
      "MISINFORMED_OR_POTENTIALLY_MISLEADING" = "Misleading",
      "NOT_MISLEADING" = "Not Misleading"
    )
  ) +
  labs(fill = "Trustworthy Source")
Figure 3: Number of users who checked the answer options in response to the question, “Why do you believe this tweet may be misleading?”.
# Figure 3: number of notes that ticked each "misleading" reason checkbox.
#
# The checkbox columns are selected by name (the same names that key the axis
# labels) rather than by column position, so a reordered or extended input
# file cannot silently sum the wrong columns; a missing column now raises an
# error instead.
misleading_reason_labels <- c(
  "misleadingFactualError" = "Factual Error",
  "misleadingMissingImportantContext" = "Missing Important Context",
  "misleadingUnverifiedClaimAsFact" = "Unverified Claim as Fact",
  "misleadingOutdatedInformation" = "Outdated Information",
  "misleadingSatire" = "Satire",
  "misleadingOther" = "Other",
  "misleadingManipulatedMedia" = "Manipulated Media"
)

birdwatch_notes %>%
  summarize(across(all_of(names(misleading_reason_labels)), sum)) %>%
  pivot_longer(everything(), names_to = "name", values_to = "total_count") %>%
  # Bars are ordered by their total so the most frequent reason is on top.
  ggplot(aes(x = reorder(name, total_count), y = total_count)) +
  geom_col(fill = "dark red") +
  coord_flip() +
  scale_x_discrete(labels = misleading_reason_labels) +
  xlab("Misleading Type") +
  ylab("Number of Bird Watch Notes")
Figure 4: Number of users who checked the answer options in response to the question, “Why do you believe this tweet is not misleading?”.
# Figure 4: number of notes that ticked each "not misleading" reason checkbox.
#
# Columns are selected by name (the names that key the axis labels) instead of
# by position, and the "Outdates" typo in the axis label is corrected.
not_misleading_reason_labels <- c(
  "notMisleadingFactuallyCorrect" = "Factually Correct",
  "notMisleadingOutdatedButNotWhenWritten" = "Outdated but Not When Written",
  "notMisleadingClearlySatire" = "Clearly Satire",
  "notMisleadingOther" = "Other",
  "notMisleadingPersonalOpinion" = "Personal Opinion"
)

birdwatch_notes %>%
  summarize(across(all_of(names(not_misleading_reason_labels)), sum)) %>%
  pivot_longer(everything(), names_to = "name", values_to = "total_count") %>%
  # Bars are ordered by their total so the most frequent reason is on top.
  ggplot(aes(x = reorder(name, total_count), y = total_count)) +
  geom_col(fill = "dark blue") +
  coord_flip() +
  scale_x_discrete(labels = not_misleading_reason_labels) +
  xlab("Not Misleading Type") +
  ylab("Number of Bird Watch Notes")
Figure 5c: The word count in text explanation of Birdwatch Notes.
# Figure 5c: CCDF (in percent) of the number of words in each note's summary,
# drawn separately for each classification.
birdwatch_notes %>%
  mutate(word_count = str_count(summary, "\\w+")) %>%
  # Number of notes at every (classification, word_count) combination.
  count(classification, word_count, name = "num_notes") %>%
  # cumsum() below relies on ascending word_count within each classification.
  arrange(classification, word_count) %>%
  group_by(classification) %>%
  mutate(
    cumulative_tot_notes = cumsum(num_notes),
    # Share of notes with strictly more words than this row.
    frac_notes = 1 - cumulative_tot_notes / sum(num_notes)
  ) %>%
  ungroup() %>%
  ggplot(aes(x = word_count, y = 100 * frac_notes, color = classification)) +
  # `linewidth` replaces the deprecated `size` aesthetic for lines.
  geom_line(linewidth = 1) +
  labs(
    x = "Word Count",
    y = "CCDF (%)"
  ) +
  # Argument names spelled out in full; the original relied on partial
  # matching of `limit`/`label`.
  scale_y_log10(limits = c(0.01, 100), labels = comma)
Figure 7a: The CCDFs for helpfulness ratio.
# Figure 7a: CCDF (in percent) of the per-note helpfulness ratio, i.e. the
# share of a note's ratings with helpful == 1, drawn per classification.
join_birdwatch_ratings %>%
  # Rows with a missing `helpful` value (including notes that received no
  # rating in the left join) carry no vote and are dropped.
  filter(!is.na(helpful)) %>%
  group_by(noteId, classification) %>%
  summarise(
    total_votes = n(),
    helpful_votes = sum(helpful == 1),
    ratio_helpful = helpful_votes / total_votes,
    .groups = "drop"
  ) %>%
  # Number of notes at every observed ratio value.
  count(classification, ratio_helpful) %>%
  arrange(ratio_helpful) %>%
  group_by(classification) %>%
  mutate(
    cumulative_tot_votes = cumsum(n),
    # Share of notes with a strictly larger ratio than this row.
    frac_votes = 1 - (cumulative_tot_votes / sum(n))
  ) %>%
  ungroup() %>%
  # The axis is labelled in percent and limited to 0.01-100, so the fraction
  # must be scaled by 100; plotting the raw 0-1 fraction pushed the curve two
  # decades too low.
  ggplot(aes(x = ratio_helpful, y = 100 * frac_votes, color = classification)) +
  geom_line() +
  labs(
    x = "Ratio Helpful",
    y = "CCDF(%)"
  ) +
  scale_y_log10(limits = c(0.01, 100), labels = comma)
Figure 7b: The CCDFs for total votes.
# Figure 7b: CCDF (in percent) of the number of votes a note received, drawn
# per classification.
#
# Fixes relative to the previous version:
# * the cumulative sum now runs over the number of notes at each vote total
#   (previously it summed the vote counts themselves, which yields the share
#   of votes rather than the share of notes);
# * rows without a rating are dropped first, as in the helpfulness-ratio
#   figure, so unrated notes from the left join are no longer counted as
#   having one vote;
# * the x axis is labelled "Total Votes" instead of "Word Count".
join_birdwatch_ratings %>%
  filter(!is.na(helpful)) %>%
  # Votes per note.
  count(classification, noteId, name = "total_votes") %>%
  # Notes per vote total.
  count(classification, total_votes, name = "num_notes") %>%
  arrange(classification, total_votes) %>%
  group_by(classification) %>%
  mutate(
    cumulative_notes = cumsum(num_notes),
    # Share of notes with strictly more votes than this row.
    frac_notes = 1 - cumulative_notes / sum(num_notes)
  ) %>%
  ungroup() %>%
  ggplot(aes(x = total_votes, y = 100 * frac_notes, color = classification)) +
  geom_line() +
  labs(
    x = "Total Votes",
    y = "CCDF (%)"
  ) +
  scale_y_log10(limits = c(0.01, 100), labels = comma)
Figure 8: Number of ratings per checkbox that users checked in response to the question, “What about this note was helpful to you?”.
# Figure 8: number of ratings that ticked each "helpful" reason checkbox.
#
# The checkbox columns are selected by name (the names that key the axis
# labels) rather than by column position, so a reordered or extended ratings
# file cannot silently sum the wrong columns.
helpful_reason_labels <- c(
  "helpfulClear" = "Clear",
  "helpfulGoodSources" = "Good Sources",
  "helpfulInformative" = "Informative",
  "helpfulEmpathetic" = "Empathetic",
  "helpfulAddressesClaim" = "Addresses Claim",
  "helpfulUniqueContext" = "Unique Context",
  "helpfulImportantContext" = "Important Context",
  "helpfulOther" = " Other"
)

filtered_rating %>%
  summarize(across(all_of(names(helpful_reason_labels)), sum)) %>%
  pivot_longer(everything(), names_to = "name", values_to = "total_count") %>%
  # Bars are ordered by their total so the most frequent reason is on top.
  ggplot(aes(x = reorder(name, total_count), y = total_count)) +
  geom_col(fill = "dark blue") +
  coord_flip() +
  scale_x_discrete(labels = helpful_reason_labels) +
  xlab("Helpfulness Type") +
  ylab("Number of Ratings")
Figure 9: Number of ratings per checkbox that users checked in response to the question, “Help us understand why this note was unhelpful?”.
# Figure 9: number of ratings that ticked each "not helpful" reason checkbox.
#
# Fixes relative to the previous version:
# * the label key "notHelpfulArgumentativeOrBias" did not correspond to the
#   "Argumentative or Inflammatory" checkbox column, so that bar kept its raw
#   column name on the axis; the key is now
#   "notHelpfulArgumentativeOrInflammatory";
# * columns are selected by these names instead of by position, so a key that
#   does not exist in the data fails loudly instead of being ignored.
not_helpful_reason_labels <- c(
  "notHelpfulMissingKeyPoints" = "Missing Key Points",
  "notHelpfulSourcesMissingOrUnreliable" = " Sources Missing or Unreliable",
  "notHelpfulOpinionSpeculationOrBias" = " Opinion Speculation or Bias",
  "notHelpfulArgumentativeOrInflammatory" = " Argumentative or Inflammatory",
  "notHelpfulIncorrect" = "Incorrect",
  "notHelpfulOther" = "Other",
  "notHelpfulOffTopic" = "Off Topic",
  "notHelpfulHardToUnderstand" = "Hard to Understand",
  "notHelpfulIrrelevantSources" = "Irrelevant Sources",
  "notHelpfulSpamHarassmentOrAbuse" = "Spam Harassment or Abuse",
  "notHelpfulOutdated" = "Outdated"
)

filtered_rating %>%
  summarize(across(all_of(names(not_helpful_reason_labels)), sum)) %>%
  pivot_longer(everything(), names_to = "name", values_to = "total_count") %>%
  # Bars are ordered by their total so the most frequent reason is on top.
  ggplot(aes(x = reorder(name, total_count), y = total_count)) +
  geom_col(fill = "dark red") +
  coord_flip() +
  scale_x_discrete(labels = not_helpful_reason_labels) +
  xlab("Not Helpfulness Type") +
  ylab("Number of Ratings")
# Load the source-tweet metadata from its .Rdata file into a private
# environment and pull out the stored object, which is saved under the
# name ".".
#
# The path is resolved relative to the project root with here::here() instead
# of an absolute path under one user's Desktop. Assumes the project root is
# the `community-notes-2025-group-5` folder that contains `vanessa/` --
# TODO confirm.
ournew_env <- new.env()
load(
  here::here("vanessa", "data", "source_tweets.Rdata"),
  envir = ournew_env
)
source_tweets <- ournew_env[["."]]

# `[[` on an environment returns NULL for a missing name; fail here with a
# clear message rather than at the first join that uses source_tweets.
if (is.null(source_tweets)) {
  stop("source_tweets.Rdata does not contain an object named '.'", call. = FALSE)
}

# Lower-case view(), as used everywhere else in this script.
view(source_tweets)
# Notes with a word count of their summary text, longest first.
birdwatch_notes_wordcount <- arrange(
  mutate(birdwatch_notes, word_count = str_count(summary, "\\w+")),
  desc(word_count)
)
view(birdwatch_notes_wordcount)

# Start from the (noteId, helpful) pairs of the ratings ...
join_notes_rating_tweets <- select(filtered_rating, noteId, helpful)
view(join_notes_rating_tweets)

# ... then attach them, together with the note-level columns, to the source
# tweets. Inner joins keep only noteIds present in all three tables.
join_notes_rating_tweets <- inner_join(
  inner_join(source_tweets, birdwatch_notes_wordcount, by = "noteId"),
  join_notes_rating_tweets,
  by = "noteId"
)
view(join_notes_rating_tweets)

# Make classification a factor and derive the source account's age in whole
# years. The age is measured up to Sys.Date(), so it depends on the day the
# script is run.
join_notes_rating_tweets <- mutate(
  join_notes_rating_tweets,
  classification = as.factor(classification),
  account_age = as.integer(
    interval(source_account_created_at, Sys.Date()) / years(1)
  )
)
view(join_notes_rating_tweets)
# Print a preview of the data with the four numeric predictors z-scaled.
# NOTE(review): the result is only printed, never assigned, so
# join_notes_rating_tweets keeps its unscaled columns and the models fitted
# from it use raw values. Assign the result if standardized predictors were
# intended.
mutate(
  join_notes_rating_tweets,
  across(
    c(source_followers_count, source_friends_count, word_count, account_age),
    scale
  )
)
## # A tibble: 35,090 × 39
## noteId tweetId.x source_user_id source_created_at source_is_quote
## <dbl> <dbl> <dbl> <dttm> <lgl>
## 1 1.36e18 1.35e18 39344374 2021-01-22 23:06:44 FALSE
## 2 1.36e18 1.35e18 39344374 2021-01-22 23:06:44 FALSE
## 3 1.36e18 1.35e18 39344374 2021-01-22 23:06:44 FALSE
## 4 1.36e18 1.35e18 39344374 2021-01-22 23:06:44 FALSE
## 5 1.38e18 1.38e18 NA NA NA
## 6 1.38e18 1.38e18 NA NA NA
## 7 1.42e18 1.42e18 3551446154 2021-07-16 18:34:50 FALSE
## 8 1.42e18 1.42e18 3551446154 2021-07-16 18:34:50 FALSE
## 9 1.42e18 1.42e18 3551446154 2021-07-16 18:34:50 FALSE
## 10 1.42e18 1.42e18 3551446154 2021-07-16 18:34:50 FALSE
## # ℹ 35,080 more rows
## # ℹ 34 more variables: source_is_retweet <lgl>, source_favorite_count <dbl>,
## # source_retweet_count <dbl>, source_quote_count <lgl>,
## # source_reply_count <lgl>, source_followers_count <dbl[,1]>,
## # source_verified <lgl>, source_friends_count <dbl[,1]>,
## # source_account_created_at <dttm>, noteAuthorParticipantId <chr>,
## # createdAtMillis <dbl>, tweetId.y <dbl>, classification <fct>, …
library(broom)
library(tidyverse)
# Logistic regression of a rating being "helpful" on note and source-account
# characteristics (one row per rating).
helpful_fit <- glm(
  helpful ~ classification + trustworthySources + word_count + account_age +
    source_friends_count + source_followers_count + source_verified,
  data = join_notes_rating_tweets,
  family = binomial
)

# Summarise the fitted glm itself. Previously `model` had already been
# overwritten with the tidied coefficient table when summary() was called,
# which only produced column-wise quantiles of that table.
summary(helpful_fit)

# `model` holds the tidy coefficient table with 99% confidence intervals; the
# print and plot code further down expects it in this form.
model <- tidy(helpful_fit, conf.int = TRUE, conf.level = 0.99)
## term estimate std.error statistic
## Length:8 Min. :-5.995e-01 Min. :1.000e-09 Min. :-13.3728
## Class :character 1st Qu.:-4.375e-01 1st Qu.:4.958e-04 1st Qu.:-11.4020
## Mode :character Median :-9.200e-07 Median :1.579e-02 Median : -1.5086
## Mean :-1.444e-01 Mean :1.981e-02 Mean : -0.8904
## 3rd Qu.: 1.248e-02 3rd Qu.:3.694e-02 3rd Qu.: 9.6509
## Max. : 3.371e-01 Max. :5.120e-02 Max. : 11.7961
## p.value conf.low conf.high
## Min. :0.000e+00 Min. :-7.315e-01 Min. :-4.677e-01
## 1st Qu.:0.000e+00 1st Qu.:-5.328e-01 1st Qu.:-3.425e-01
## Median :0.000e+00 Median :-1.550e-06 Median :-3.100e-07
## Mean :5.397e-02 Mean :-1.955e-01 Mean :-9.342e-02
## 3rd Qu.:3.566e-05 3rd Qu.: 9.268e-03 3rd Qu.: 1.570e-02
## Max. :4.316e-01 Max. : 2.635e-01 Max. : 4.108e-01
# Keep a separately named copy of the joined data for the note-level models.
join_notes_rating_tweets_pred <- join_notes_rating_tweets
view(join_notes_rating_tweets_pred)

# Show the coefficient table of the logistic model.
model |> print()
## # A tibble: 8 × 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -5.99e- 1 5.12e-2 -11.7 1.15e-31 -7.32e-1 -4.68e-1
## 2 classificationNOT_M… -5.16e- 1 3.86e-2 -13.4 8.71e-41 -6.16e-1 -4.17e-1
## 3 trustworthySources 3.37e- 1 2.86e-2 11.8 4.09e-32 2.64e-1 4.11e-1
## 4 word_count 7.55e- 3 6.61e-4 11.4 3.43e-30 5.85e-3 9.25e-3
## 5 account_age 2.73e- 2 3.01e-3 9.06 1.28e-19 1.95e-2 3.50e-2
## 6 source_friends_count -1.83e- 6 4.82e-7 -3.80 1.43e- 4 -3.10e-6 -6.14e-7
## 7 source_followers_co… 8.17e-10 1.04e-9 0.786 4.32e- 1 -1.87e-9 3.49e-9
## 8 source_verifiedTRUE -4.11e- 1 3.64e-2 -11.3 1.32e-29 -5.05e-1 -3.18e-1
# Coefficient plot for the logistic model: point estimates with their
# confidence intervals, one entry per model term.
ggplot(model, aes(x = estimate, y = term)) +
  geom_point() +
  geom_errorbar(aes(xmin = conf.low, xmax = conf.high)) +
  # After the flip the terms run along the horizontal axis.
  coord_flip() +
  scale_y_discrete(labels = c(
    # The coefficient belongs to the NOT_MISLEADING level of classification,
    # so it must not be labelled "Misleading" (that inverts its reading).
    "classificationNOT_MISLEADING" = "Not Misleading",
    "trustworthySources" = "Trustworthy Sources",
    "word_count" = "Word Count",
    "account_age" = "Account Age",
    "source_friends_count" = "Friends Count",
    "source_followers_count" = "Followers Count",
    "source_verifiedTRUE" = "Verified"
  )) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
library(MASS)
# Note-level modelling data: for every note, the number of helpful votes, the
# total number of votes and their ratio.
#
# The input has one row per rating, so after computing the per-note totals
# the data is reduced to one row per note. Without this step every note was
# repeated once per rating it received, which weights heavily rated notes
# more and overstates the sample size in the note-level regressions.
model_data <- join_notes_rating_tweets_pred %>%
  group_by(noteId) %>%
  mutate(
    helpful_votes = sum(helpful, na.rm = TRUE),
    total_votes = n(),
    helpfulness_ratio = helpful_votes / total_votes
  ) %>%
  ungroup() %>%
  distinct(noteId, .keep_all = TRUE) %>%
  filter(!is.na(helpfulness_ratio), is.finite(helpfulness_ratio))
view(model_data)

# Linear model of the helpfulness ratio on note and source-account
# characteristics, reported with 99% confidence intervals.
lm_helpfullness_ratio <- lm(
  helpfulness_ratio ~ classification + trustworthySources + word_count +
    account_age + source_friends_count + source_followers_count +
    source_verified,
  data = model_data
)
helpfullness_ratio_summary <- tidy(
  lm_helpfullness_ratio,
  conf.int = TRUE,
  conf.level = 0.99
)
print(helpfullness_ratio_summary)
## # A tibble: 8 × 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 3.56e- 1 9.15e- 3 38.9 1.98e-323 3.33e- 1 3.80e- 1
## 2 classificationNOT… -1.21e- 1 6.70e- 3 -18.1 1.71e- 72 -1.38e- 1 -1.04e- 1
## 3 trustworthySources 8.14e- 2 5.12e- 3 15.9 8.73e- 57 6.82e- 2 9.46e- 2
## 4 word_count 1.82e- 3 1.18e- 4 15.4 4.38e- 53 1.51e- 3 2.12e- 3
## 5 account_age 6.51e- 3 5.37e- 4 12.1 1.08e- 33 5.12e- 3 7.89e- 3
## 6 source_friends_co… -4.29e- 7 8.21e- 8 -5.22 1.79e- 7 -6.40e- 7 -2.17e- 7
## 7 source_followers_… 2.15e-10 1.85e-10 1.16 2.45e- 1 -2.61e-10 6.92e-10
## 8 source_verifiedTR… -9.95e- 2 6.54e- 3 -15.2 3.70e- 52 -1.16e- 1 -8.27e- 2
# Turn the three non-numeric predictors into numbers (a factor becomes its
# integer level code), then add a z-scaled copy of every predictor under the
# name <column>_z.
model_data <- model_data |>
  mutate(
    across(c(source_verified, classification, trustworthySources), as.numeric)
  ) |>
  mutate(
    across(
      c(
        classification, trustworthySources, word_count, source_friends_count,
        source_followers_count, source_verified, account_age
      ),
      scale,
      .names = "{.col}_z"
    )
  )

# Negative binomial regression of a note's total votes on the scaled
# predictors, reported with 99% confidence intervals.
model_votes <- glm.nb(
  total_votes ~ classification_z + trustworthySources_z + word_count_z +
    source_friends_count_z + source_followers_count_z + source_verified_z +
    account_age_z,
  data = model_data
)
model_summary_votes <- tidy(model_votes, conf.int = TRUE, conf.level = 0.99)
print(model_summary_votes)
## # A tibble: 8 × 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 3.07 0.00564 544. 0 3.05 3.08
## 2 classification_z 0.0982 0.00558 17.6 2.61e- 69 0.0831 0.113
## 3 trustworthySources_z 0.0302 0.00629 4.80 1.55e- 6 0.0136 0.0469
## 4 word_count_z 0.0272 0.00635 4.28 1.84e- 5 0.00998 0.0444
## 5 source_friends_coun… 0.0240 0.00569 4.22 2.39e- 5 0.00627 0.0426
## 6 source_followers_co… 0.0103 0.00581 1.77 7.69e- 2 -0.00696 0.0278
## 7 source_verified_z 0.308 0.00602 51.1 0 0.293 0.323
## 8 account_age_z -0.135 0.00588 -22.9 2.07e-116 -0.150 -0.119
# Tag each coefficient table with its dependent variable and stack them.
helpfullness_ratio_summary <- helpfullness_ratio_summary %>%
  mutate(DV = "Helpfulness Ratio")
model_summary_votes <- model_summary_votes |>
  mutate(DV = "Total Votes")
combined_summary <- bind_rows(helpfullness_ratio_summary, model_summary_votes)

# Drop the intercepts and map the model-specific term names (e.g.
# "classificationNOT_MISLEADING" and "classification_z") onto one shared
# display name per predictor so both models line up in the plot.
combined_summary <- combined_summary |>
  filter(term != "(Intercept)") |>
  mutate(
    term = case_when(
      # Both models code classification towards NOT_MISLEADING (the dummy
      # level in the linear model, the higher factor code in the count
      # model), so the effect is that of a note being "Not Misleading".
      grepl("classification", term) ~ "Not Misleading",
      grepl("trustworthySources", term) ~ "Trustworthy Sources",
      grepl("word_count", term) ~ "Word Count",
      grepl("source_friends_count", term) ~ "Source Friends Count",
      grepl("source_followers_count", term) ~ "Source Followers Count",
      grepl("source_verified", term) ~ "Source Verified",
      grepl("account_age", term) ~ "Source Account Age",
      TRUE ~ term
    )
  )
# Figure 10: coefficient estimates with confidence intervals for both
# dependent variables, dodged side by side per predictor.
#
# combined_summary$term already holds display names when it reaches this
# plot, so the former scale_y_discrete(labels = ...) call, which was keyed by
# raw model term names, matched nothing and has been removed.
#
# NOTE(review): the axis is labelled "Standardized", but only the Total Votes
# model is fitted on the *_z predictors; the Helpfulness Ratio model uses the
# unscaled columns -- confirm whether both should be standardized.
ggplot(combined_summary, aes(x = estimate, y = term, color = DV)) +
  geom_point(position = position_dodge(width = 0.5)) +
  geom_errorbarh(
    aes(xmin = conf.low, xmax = conf.high),
    height = 0.2,
    position = position_dodge(width = 0.5)
  ) +
  # Reference line at zero effect.
  geom_vline(xintercept = 0, linetype = "dashed", color = "gray50") +
  labs(
    title = "Figure 10: Regression Results",
    x = "Standardized Coefficient Estimate",
    y = NULL,
    color = "Dependent Variable"
  ) +
  theme_minimal() +
  # After the flip the predictors run along the horizontal axis.
  coord_flip() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))