library(here)
library(scales)
library(tidyverse)
library(readr)
library(here)
library(parameters)

# Make theme_bw() the default ggplot2 theme for every plot that follows.
theme_set(theme_bw())

# Show the source code of each chunk in the knitted output.
knitr::opts_chunk$set(echo = TRUE)

Load community notes and rating data

# Load the filtered notes and ratings exports.
# Both files sit in the same folder, so the folder is named once; forward
# slashes work on Windows and avoid escaping every backslash.
# NOTE(review): this is still a machine-specific absolute path. `here` is
# attached at the top of the file, so here::here() is presumably the intended
# replacement once the project root is confirmed.
# NOTE(review): the tibble printed further down shows noteId/tweetId stored
# as <dbl> with values around 1e18, which is above 2^53, so distinct ids may
# collapse to the same double. Reading them as character would be safer, but
# every table joined on noteId (including source_tweets) would have to change
# type together.
data_dir <- "C:/Users/ds3/Desktop/coursework/week4/project/community-notes-2025-group-5/vanessa"

birdwatch_notes <- read_tsv(
  file.path(data_dir, "birdwatch-public-data-2025-06-16-notes_filtered.tsv")
)
filtered_rating <- read_tsv(
  file.path(data_dir, "complete_filtered_ratings.tsv")
)

# Manual inspection of the two raw tables.
view(birdwatch_notes)
view(filtered_rating)

Merge Data Frames

# Attach every rating to its note, matching on noteId. Being a left join,
# notes without any matching rating are kept, with NA in the rating columns.
join_birdwatch_ratings <- birdwatch_notes |>
  left_join(filtered_rating, by = "noteId")

view(join_birdwatch_ratings)

Figure 2 - Trustworthy Sources

Figure 2: Number of users who responded “Yes” to the question, “Did you link to sources you believe most people would consider trustworthy?”.

# Stacked bars: number of notes per classification, split by the value of
# the trustworthySources flag.
birdwatch_notes |>
  count(classification, trustworthySources) |>
  ggplot(
    aes(
      x = classification,
      y = n,
      fill = factor(trustworthySources)
    )
  ) +
  geom_col(position = "stack") +
  scale_x_discrete(
    labels = c(
      "MISINFORMED_OR_POTENTIALLY_MISLEADING" = "Misleading",
      "NOT_MISLEADING" = "Not Misleading"
    )
  ) +
  labs(fill = "Trustworthy Source")

Figure 3 - Number of Birdwatch Notes Per Checkbox (Misleading)

Figure 3: Number of users who checked the answer options in response to the question, “Why do you believe this tweet may be misleading?”.

# Horizontal bar chart of how often each "misleading" reason was ticked.
# Columns 9 to 15 of birdwatch_notes are summed and reshaped to one row per
# column; bars are ordered by their total.
# NOTE(review): the columns are picked by position; the label keys below are
# presumably the names of exactly those seven columns -- confirm if the file
# layout changes.
birdwatch_notes |>
  summarise(across(9:15, sum)) |>
  pivot_longer(
    everything(),
    names_to = "name",
    values_to = "total_count"
  ) |>
  ggplot(aes(x = reorder(name, total_count), y = total_count)) +
  geom_col(fill = "dark red") +
  coord_flip() +
  scale_x_discrete(
    labels = c(
      "misleadingFactualError" = "Factual Error",
      "misleadingMissingImportantContext" = "Missing Important Context",
      "misleadingUnverifiedClaimAsFact" = "Unverified Claim as Fact",
      "misleadingOutdatedInformation" = "Outdated Information",
      "misleadingSatire" = "Satire",
      "misleadingOther" = "Other",
      "misleadingManipulatedMedia" = "Manipulated Media"
    )
  ) +
  labs(x = "Misleading Type", y = "Number of Bird Watch Notes")

Figure 4 - Number of Birdwatch Notes Per Checkbox (Not Misleading)

Figure 4: Number of users who checked the answer options in response to the question, “Why do you believe this tweet is not misleading?”.

# Horizontal bar chart of how often each "not misleading" reason was ticked.
# Columns 16 to 20 of birdwatch_notes are summed and reshaped to one row per
# column; bars are ordered by their total.
# NOTE(review): the columns are picked by position; the label keys below are
# presumably the names of exactly those five columns -- confirm if the file
# layout changes.
birdwatch_notes %>%
  summarize(across(c(16:20), sum)) %>%
  pivot_longer(everything(), names_to = "name", values_to = "total_count") %>%
  ggplot(aes(x = reorder(name, total_count), y = total_count)) +
  geom_bar(stat = "identity", fill = "dark blue") +
  coord_flip() +
  scale_x_discrete(labels = c(
    "notMisleadingFactuallyCorrect" = "Factually Correct",
    # Fixed axis-label typo ("Outdates" -> "Outdated").
    "notMisleadingOutdatedButNotWhenWritten" = "Outdated but Not When Written",
    "notMisleadingClearlySatire" = "Clearly Satire",
    "notMisleadingOther" = "Other",
    "notMisleadingPersonalOpinion" = "Personal Opinion"
  )) +
  xlab("Not Misleading Type") +
  ylab("Number of Bird Watch Notes")

Figure 5c - Misleading Status

Figure 5c: The CCDFs for the word count of the text explanations in Birdwatch notes.

# CCDF of note length (in words) by classification.
# For every word count the share of notes that are strictly longer is
# computed within each classification and plotted in percent on a log axis.
birdwatch_notes %>%
  mutate(word_count = str_count(summary, "\\w+")) %>%
  # One row per (classification, word_count) with the number of such notes.
  count(classification, word_count, name = "num_notes") %>%
  # cumsum() below depends on ascending word_count inside each group.
  arrange(classification, word_count) %>%
  group_by(classification) %>%
  mutate(
    cumulative_tot_notes = cumsum(num_notes),
    frac_notes = 1 - (cumulative_tot_notes / sum(num_notes))
  ) %>%
  ungroup() %>%
  ggplot(aes(x = word_count, y = 100 * frac_notes, color = classification)) +
  # `linewidth` replaces the deprecated `size` for line geoms.
  geom_line(linewidth = 1) +
  labs(
    x = "Word Count",
    # The plotted value is 100 * fraction, i.e. a percentage.
    y = "CCDF(%)"
  ) +
  # Argument names spelled out in full instead of relying on partial
  # matching of `limit` / `label`.
  scale_y_log10(limits = c(0.01, 100), labels = comma)

Figure 7a - Helpfulness Ratio

Figure 7a: The CCDFs for helpfulness ratio.

# CCDF of the per-note helpfulness ratio by classification.
# Ratings with a missing `helpful` value are dropped; for each note the ratio
# is (ratings with helpful == 1) / (all remaining ratings of that note).
join_birdwatch_ratings %>%
  filter(!is.na(helpful)) %>%
  group_by(noteId, classification) %>%
  summarise(
    total_votes = n(),
    helpful_votes = sum(helpful == 1),
    ratio_helpful = helpful_votes / total_votes,
    .groups = "drop"
  ) %>%
  # Number of notes at each distinct ratio, per classification.
  count(classification, ratio_helpful) %>%
  # cumsum() below depends on ascending ratio inside each group.
  arrange(classification, ratio_helpful) %>%
  group_by(classification) %>%
  mutate(
    cumulative_tot_votes = cumsum(n),
    frac_votes = 1 - (cumulative_tot_votes / sum(n))
  ) %>%
  ungroup() %>%
  # The axis is labelled in percent and limited to 0.01-100, so the fraction
  # is scaled by 100 (as the other CCDF figures do); plotting the raw 0-1
  # fraction put the curve on the wrong scale.
  ggplot(aes(x = ratio_helpful, y = 100 * frac_votes, color = classification)) +
  geom_line() +
  labs(
    x = "Ratio Helpful",
    y = "CCDF(%)"
  ) +
  # Argument names spelled out in full instead of relying on partial
  # matching of `limit` / `label`.
  scale_y_log10(limits = c(0.01, 100), labels = comma)

Figure 7b - Total Votes

Figure 7b: The CCDFs for total votes.

# CCDF of the number of ratings ("total votes") per note, by classification.
# NOTE(review): join_birdwatch_ratings comes from a left join, so a note
# without any rating still contributes one row and is counted as 1 here.
join_birdwatch_ratings %>%
  # Rows per note = total votes of that note.
  count(classification, noteId, name = "total_votes") %>%
  # Number of notes at each vote count. The CCDF must accumulate notes, not
  # votes: the previous cumsum over the per-note vote counts produced the
  # share of votes rather than the share of notes.
  count(classification, total_votes, name = "num_notes") %>%
  arrange(classification, total_votes) %>%
  group_by(classification) %>%
  mutate(
    cumulative_tot_notes = cumsum(num_notes),
    frac_notes = 1 - (cumulative_tot_notes / sum(num_notes))
  ) %>%
  ungroup() %>%
  ggplot(aes(x = total_votes, y = 100 * frac_notes, color = classification)) +
  geom_line() +
  labs(
    # Was "Word Count", copied over from the word-count figure.
    x = "Total Votes",
    y = "CCDF(%)"
  ) +
  # Argument names spelled out in full instead of relying on partial
  # matching of `limit` / `label`.
  scale_y_log10(limits = c(0.01, 100), labels = comma)

Figure 8 - Number of Ratings Per Checkbox (Helpful)

Figure 8: Number of ratings per checkbox that users checked in response to the question, “What about this note was helpful to you?”.

# Horizontal bar chart of how often each "helpful" reason was ticked.
# Columns 10 to 17 of filtered_rating are summed and reshaped to one row per
# column; bars are ordered by their total.
# NOTE(review): the columns are picked by position; the label keys below are
# presumably the names of exactly those eight columns -- confirm if the file
# layout changes.
filtered_rating |>
  summarise(across(10:17, sum)) |>
  pivot_longer(
    everything(),
    names_to = "name",
    values_to = "total_count"
  ) |>
  ggplot(aes(x = reorder(name, total_count), y = total_count)) +
  geom_col(fill = "dark blue") +
  coord_flip() +
  scale_x_discrete(
    labels = c(
      "helpfulClear" = "Clear",
      "helpfulGoodSources" = "Good Sources",
      "helpfulInformative" = "Informative",
      "helpfulEmpathetic" = "Empathetic",
      "helpfulAddressesClaim" = "Addresses Claim",
      "helpfulUniqueContext" = "Unique Context",
      "helpfulImportantContext" = "Important Context",
      "helpfulOther" = " Other"
    )
  ) +
  labs(x = "Helpfulness Type", y = "Number of Ratings")

Figure 9 - Number of Ratings Per Checkbox (Unhelpful)

Figure 9: Number of ratings per checkbox that users checked in response to the question, “Help us understand why this note was unhelpful?”.

# Horizontal bar chart of how often each "not helpful" reason was ticked.
# Columns 19 to 29 of filtered_rating are summed and reshaped to one row per
# column; bars are ordered by their total.
# NOTE(review): the columns are picked by position; the label keys below are
# presumably the names of exactly those eleven columns -- confirm if the file
# layout changes.
filtered_rating %>%
  summarize(across(c(19:29), sum)) %>%
  pivot_longer(everything(), names_to = "name", values_to = "total_count") %>%
  ggplot(aes(x = reorder(name, total_count), y = total_count)) +
  geom_bar(stat = "identity", fill = "dark red") +
  coord_flip() +
  scale_x_discrete(labels = c(
    "notHelpfulMissingKeyPoints" = "Missing Key Points",
    "notHelpfulSourcesMissingOrUnreliable" = " Sources Missing or Unreliable",
    "notHelpfulOpinionSpeculationOrBias" = " Opinion Speculation or Bias",
    # The ratings column is named notHelpfulArgumentativeOrBiased in the
    # public data; the previous key (...OrBias) matched no column, so this
    # bar was shown with its raw column name.
    "notHelpfulArgumentativeOrBiased" = " Argumentative or Inflammatory",
    "notHelpfulIncorrect" = "Incorrect",
    "notHelpfulOther" = "Other",
    "notHelpfulOffTopic" = "Off Topic",
    "notHelpfulHardToUnderstand" = "Hard to Understand",
    "notHelpfulIrrelevantSources" = "Irrelevant Sources",
    "notHelpfulSpamHarassmentOrAbuse" = "Spam Harassment or Abuse",
    "notHelpfulOutdated" = "Outdated"
  )) +
  xlab("Not Helpfulness Type") +
  ylab("Number of Ratings")

Logistic Regression

# Load the source-tweet metadata.
# The .Rdata file is loaded into its own environment instead of the global
# workspace, and the table is then read from the binding named ".".
# The environment is created on its own line rather than by an assignment
# hidden inside the load() call.
ournew_env <- new.env()
load(
  "C:/Users/ds3/Desktop/coursework/week4/project/community-notes-2025-group-5/vanessa/data/source_tweets.Rdata",
  envir = ournew_env
)
source_tweets <- ournew_env[["."]]

# Lower-case view(), as used for every other table in this file.
view(source_tweets)

# Notes with a word_count column (number of "\\w+" matches in the summary
# text), longest notes first.
birdwatch_notes_wordcount <- arrange(
  mutate(birdwatch_notes, word_count = str_count(summary, "\\w+")),
  desc(word_count)
)
view(birdwatch_notes_wordcount)

# Build the rating-level modelling table.
# Step 1: keep only the rating columns needed later (note id and the
# `helpful` flag).
# select() is namespace-qualified because library(MASS) is attached further
# down in this file and masks dplyr::select(); without the prefix this line
# fails when it is re-run in the same session after MASS has been attached.
join_notes_rating_tweets <- filtered_rating %>%
  dplyr::select(noteId, helpful)
view(join_notes_rating_tweets)

# Step 2: source tweets x notes (with word counts) x ratings. Inner joins
# keep only note ids that are present in all three tables.
join_notes_rating_tweets <- source_tweets %>%
  inner_join(birdwatch_notes_wordcount, by = "noteId") %>%
  inner_join(join_notes_rating_tweets, by = "noteId")

view(join_notes_rating_tweets)

# Prepare the model predictors:
#   * classification becomes a factor, so the models treat it as categorical;
#   * account_age is the number of whole years between the source account's
#     creation time and today's date (as.integer() truncates the fraction).
# interval() and years() are lubridate functions. lubridate is never attached
# explicitly in this file, so the calls are namespace-qualified instead of
# depending on which packages library(tidyverse) happens to attach.
# NOTE(review): because of Sys.Date(), account_age changes with the day the
# script is run; use a fixed reference date if results must be reproducible.
join_notes_rating_tweets <- join_notes_rating_tweets %>%
  mutate(
    classification = as.factor(classification),
    account_age = as.integer(
      lubridate::interval(source_account_created_at, Sys.Date()) /
        lubridate::years(1)
    )
  )
view(join_notes_rating_tweets)

# Print the table with the four numeric predictors passed through scale().
# NOTE(review): the result is only printed, never assigned, so
# join_notes_rating_tweets itself keeps the raw values and the models fitted
# on it below use unscaled predictors -- confirm whether an assignment was
# intended.
join_notes_rating_tweets |>
  mutate(
    across(
      c(source_followers_count, source_friends_count, word_count, account_age),
      scale
    )
  )
## # A tibble: 35,090 × 39
##     noteId tweetId.x source_user_id source_created_at   source_is_quote
##      <dbl>     <dbl>          <dbl> <dttm>              <lgl>          
##  1 1.36e18   1.35e18       39344374 2021-01-22 23:06:44 FALSE          
##  2 1.36e18   1.35e18       39344374 2021-01-22 23:06:44 FALSE          
##  3 1.36e18   1.35e18       39344374 2021-01-22 23:06:44 FALSE          
##  4 1.36e18   1.35e18       39344374 2021-01-22 23:06:44 FALSE          
##  5 1.38e18   1.38e18             NA NA                  NA             
##  6 1.38e18   1.38e18             NA NA                  NA             
##  7 1.42e18   1.42e18     3551446154 2021-07-16 18:34:50 FALSE          
##  8 1.42e18   1.42e18     3551446154 2021-07-16 18:34:50 FALSE          
##  9 1.42e18   1.42e18     3551446154 2021-07-16 18:34:50 FALSE          
## 10 1.42e18   1.42e18     3551446154 2021-07-16 18:34:50 FALSE          
## # ℹ 35,080 more rows
## # ℹ 34 more variables: source_is_retweet <lgl>, source_favorite_count <dbl>,
## #   source_retweet_count <dbl>, source_quote_count <lgl>,
## #   source_reply_count <lgl>, source_followers_count <dbl[,1]>,
## #   source_verified <lgl>, source_friends_count <dbl[,1]>,
## #   source_account_created_at <dttm>, noteAuthorParticipantId <chr>,
## #   createdAtMillis <dbl>, tweetId.y <dbl>, classification <fct>, …
library(broom)
library(tidyverse)

# Logistic regression of the rating-level `helpful` flag on note and
# source-account characteristics.
# The fitted glm is kept in its own object so that summary() reports on the
# model itself. Previously `model` was overwritten with the tidy() tibble
# before summary() was called, which only printed quantiles of the
# coefficient-table columns.
logit_fit <- glm(
  helpful ~ classification + trustworthySources + word_count + account_age +
    source_friends_count + source_followers_count + source_verified,
  data = join_notes_rating_tweets,
  family = binomial
)
summary(logit_fit)

# `model` remains the tidy coefficient table (with 99% confidence intervals)
# that the print() and ggplot() calls below expect.
model <- tidy(logit_fit, conf.int = TRUE, conf.level = 0.99)
##      term              estimate            std.error           statistic       
##  Length:8           Min.   :-5.995e-01   Min.   :1.000e-09   Min.   :-13.3728  
##  Class :character   1st Qu.:-4.375e-01   1st Qu.:4.958e-04   1st Qu.:-11.4020  
##  Mode  :character   Median :-9.200e-07   Median :1.579e-02   Median : -1.5086  
##                     Mean   :-1.444e-01   Mean   :1.981e-02   Mean   : -0.8904  
##                     3rd Qu.: 1.248e-02   3rd Qu.:3.694e-02   3rd Qu.:  9.6509  
##                     Max.   : 3.371e-01   Max.   :5.120e-02   Max.   : 11.7961  
##     p.value             conf.low            conf.high         
##  Min.   :0.000e+00   Min.   :-7.315e-01   Min.   :-4.677e-01  
##  1st Qu.:0.000e+00   1st Qu.:-5.328e-01   1st Qu.:-3.425e-01  
##  Median :0.000e+00   Median :-1.550e-06   Median :-3.100e-07  
##  Mean   :5.397e-02   Mean   :-1.955e-01   Mean   :-9.342e-02  
##  3rd Qu.:3.566e-05   3rd Qu.: 9.268e-03   3rd Qu.: 1.570e-02  
##  Max.   :4.316e-01   Max.   : 2.635e-01   Max.   : 4.108e-01
# Working copy of the modelling table, used by the note-level models below.
join_notes_rating_tweets_pred <- join_notes_rating_tweets
view(join_notes_rating_tweets_pred)

# Show the tidy coefficient table of the logistic regression.
print(model)
## # A tibble: 8 × 7
##   term                  estimate std.error statistic  p.value conf.low conf.high
##   <chr>                    <dbl>     <dbl>     <dbl>    <dbl>    <dbl>     <dbl>
## 1 (Intercept)          -5.99e- 1   5.12e-2   -11.7   1.15e-31 -7.32e-1  -4.68e-1
## 2 classificationNOT_M… -5.16e- 1   3.86e-2   -13.4   8.71e-41 -6.16e-1  -4.17e-1
## 3 trustworthySources    3.37e- 1   2.86e-2    11.8   4.09e-32  2.64e-1   4.11e-1
## 4 word_count            7.55e- 3   6.61e-4    11.4   3.43e-30  5.85e-3   9.25e-3
## 5 account_age           2.73e- 2   3.01e-3     9.06  1.28e-19  1.95e-2   3.50e-2
## 6 source_friends_count -1.83e- 6   4.82e-7    -3.80  1.43e- 4 -3.10e-6  -6.14e-7
## 7 source_followers_co…  8.17e-10   1.04e-9     0.786 4.32e- 1 -1.87e-9   3.49e-9
## 8 source_verifiedTRUE  -4.11e- 1   3.64e-2   -11.3   1.32e-29 -5.05e-1  -3.18e-1
  # Coefficient plot for the logistic regression: point estimate with its
  # confidence interval for every term (the intercept keeps its raw name).
  # coord_flip() puts the terms on the horizontal axis, which is why the
  # axis.text.x theme setting below rotates the term labels.
  ggplot(model, aes(x = estimate, y = term)) +
    geom_point() +
    geom_errorbar(aes(xmin = conf.low, xmax = conf.high)) +
    coord_flip() +
    scale_y_discrete(labels = c(
      # This term is the effect of the NOT_MISLEADING level relative to the
      # reference level, so it is labelled "Not Misleading"; the previous
      # "Misleading" label inverted the reading of the coefficient's sign.
      "classificationNOT_MISLEADING" = "Not Misleading",
      "trustworthySources" = "Trustworthy Sources",
      "word_count" = "Word Count",
      "account_age" = "Account Age",
      "source_friends_count" = "Friends Count",
      "source_followers_count" = "Followers Count",
      "source_verifiedTRUE" = "Verified"
    )) +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

library(MASS)

# Note-level table for the helpfulness-ratio and total-votes models.
# Per note: helpful_votes = sum of `helpful` (NA ignored), total_votes =
# number of rating rows, helpfulness_ratio = helpful_votes / total_votes.
model_data <- join_notes_rating_tweets_pred %>%
  group_by(noteId) %>%
  mutate(
    helpful_votes = sum(helpful, na.rm = TRUE),
    total_votes = n(),
    helpfulness_ratio = helpful_votes / total_votes
  ) %>%
  ungroup() %>%
  filter(!is.na(helpfulness_ratio), is.finite(helpfulness_ratio)) %>%
  # The outcomes above are constant within a note, but the table still had
  # one row per rating, so every note entered the regressions once per rating
  # it received (heavily rated notes dominated and the sample size was
  # inflated). Keep a single row per note.
  # NOTE(review): after this step the rating-level `helpful` column holds
  # only the first rating of each note and should not be used downstream.
  distinct(noteId, .keep_all = TRUE)
view(model_data)

# Linear model of the per-note helpfulness ratio on the same predictors as
# the logistic regression, followed by its tidy coefficient table with 99%
# confidence intervals.
lm_helpfullness_ratio <- lm(
  helpfulness_ratio ~ classification + trustworthySources + word_count +
    account_age + source_friends_count + source_followers_count +
    source_verified,
  data = model_data
)

helpfullness_ratio_summary <- lm_helpfullness_ratio |>
  tidy(conf.int = TRUE, conf.level = 0.99)
print(helpfullness_ratio_summary)
## # A tibble: 8 × 7
##   term                estimate std.error statistic   p.value  conf.low conf.high
##   <chr>                  <dbl>     <dbl>     <dbl>     <dbl>     <dbl>     <dbl>
## 1 (Intercept)         3.56e- 1  9.15e- 3     38.9  1.98e-323  3.33e- 1  3.80e- 1
## 2 classificationNOT… -1.21e- 1  6.70e- 3    -18.1  1.71e- 72 -1.38e- 1 -1.04e- 1
## 3 trustworthySources  8.14e- 2  5.12e- 3     15.9  8.73e- 57  6.82e- 2  9.46e- 2
## 4 word_count          1.82e- 3  1.18e- 4     15.4  4.38e- 53  1.51e- 3  2.12e- 3
## 5 account_age         6.51e- 3  5.37e- 4     12.1  1.08e- 33  5.12e- 3  7.89e- 3
## 6 source_friends_co… -4.29e- 7  8.21e- 8     -5.22 1.79e-  7 -6.40e- 7 -2.17e- 7
## 7 source_followers_…  2.15e-10  1.85e-10      1.16 2.45e-  1 -2.61e-10  6.92e-10
## 8 source_verifiedTR… -9.95e- 2  6.54e- 3    -15.2  3.70e- 52 -1.16e- 1 -8.27e- 2
# Negative-binomial model of total votes on z-scored predictors.
# The three non-numeric predictors are first converted with as.numeric();
# then every predictor gets a scale()d copy named "<column>_z".
model_data <- model_data |>
  mutate(
    source_verified = as.numeric(source_verified),
    classification = as.numeric(classification),
    trustworthySources = as.numeric(trustworthySources)
  ) |>
  mutate(
    across(
      c(
        classification, trustworthySources, word_count,
        source_friends_count, source_followers_count, source_verified,
        account_age
      ),
      scale,
      .names = "{.col}_z"
    )
  )

model_votes <- glm.nb(
  total_votes ~ classification_z + trustworthySources_z + word_count_z +
    source_friends_count_z + source_followers_count_z + source_verified_z +
    account_age_z,
  data = model_data
)

# Tidy coefficient table with 99% confidence intervals.
model_summary_votes <- model_votes |>
  tidy(conf.int = TRUE, conf.level = 0.99)
print(model_summary_votes)
## # A tibble: 8 × 7
##   term                 estimate std.error statistic   p.value conf.low conf.high
##   <chr>                   <dbl>     <dbl>     <dbl>     <dbl>    <dbl>     <dbl>
## 1 (Intercept)            3.07     0.00564    544.   0          3.05       3.08  
## 2 classification_z       0.0982   0.00558     17.6  2.61e- 69  0.0831     0.113 
## 3 trustworthySources_z   0.0302   0.00629      4.80 1.55e-  6  0.0136     0.0469
## 4 word_count_z           0.0272   0.00635      4.28 1.84e-  5  0.00998    0.0444
## 5 source_friends_coun…   0.0240   0.00569      4.22 2.39e-  5  0.00627    0.0426
## 6 source_followers_co…   0.0103   0.00581      1.77 7.69e-  2 -0.00696    0.0278
## 7 source_verified_z      0.308    0.00602     51.1  0          0.293      0.323 
## 8 account_age_z         -0.135    0.00588    -22.9  2.07e-116 -0.150     -0.119
# Figure 10: coefficients of the helpfulness-ratio and total-votes models
# side by side.
# Tag each coefficient table with the dependent variable it belongs to.
helpfullness_ratio_summary <- helpfullness_ratio_summary %>%
  mutate(DV = "Helpfulness Ratio")
model_summary_votes <- model_summary_votes |>
  mutate(DV = "Total Votes")

combined_summary <- bind_rows(helpfullness_ratio_summary, model_summary_votes)

# Drop the intercepts and map the raw term names of both models (e.g.
# "classificationNOT_MISLEADING" and "classification_z") onto one shared
# display label per predictor.
combined_summary <- combined_summary |>
  filter(term != "(Intercept)") |>
  mutate(
    term = case_when(
      # In both models a larger value of the classification term stands for
      # NOT_MISLEADING (dummy for that level in the lm; factor code 2 of 2
      # in the z-scored variable), so the label is "Not Misleading". The
      # previous "Misleading" label inverted the reading of the sign.
      grepl("classification", term) ~ "Not Misleading",
      grepl("trustworthySources", term) ~ "Trustworthy Sources",
      grepl("word_count", term) ~ "Word Count",
      grepl("source_friends_count", term) ~ "Source Friends Count",
      grepl("source_followers_count", term) ~ "Source Followers Count",
      grepl("source_verified", term) ~ "Source Verified",
      grepl("account_age", term) ~ "Source Account Age",
      TRUE ~ term
    )
  )

# NOTE(review): the x-axis label says "Standardized", but the Helpfulness
# Ratio estimates come from the lm fitted on the unscaled predictors, while
# only the Total Votes model uses the *_z columns -- the two series are not
# on a comparable scale until the lm is refitted on z-scored predictors.
# The former scale_y_discrete(labels = ...) call was removed: its keys were
# raw term names, which no longer exist after the recode above, so it never
# changed any label.
ggplot(combined_summary, aes(x = estimate, y = term, color = DV)) +
  geom_point(position = position_dodge(width = 0.5)) +
  geom_errorbarh(
    aes(xmin = conf.low, xmax = conf.high),
    height = 0.2,
    position = position_dodge(width = 0.5)
  ) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "gray50") +
  labs(
    title = "Figure 10: Regression Results",
    x = "Standardized Coefficient Estimate",
    y = NULL,
    color = "Dependent Variable"
  ) +
  theme_minimal() +
  # After the flip the predictors sit on the horizontal axis, hence the
  # rotated axis.text.x below.
  coord_flip() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))