### Load packages
# Standard
library(tidyverse) # Collection of all the good stuff like dplyr, ggplot2 ect.
library(magrittr) # For extra-piping operators (eg. %<>%)
# Network specific
library(tidygraph) # For tidy-style graph manipulation
library(ggraph) # For ggplot2 style graph plotting
# EconGeo Specific
library(EconGeo) # Economc Geography functions
So, let’s start the fun. I for you extracted Chinese patents from our EPO PATSTAT databases filed at either the EPO or the USTPO. I further provide you adittional data. Lets take a look:
patents <- readRDS("data/CN_patent.rds")
inventors <- readRDS("data/CN_inventor.rds") %>% filter(str_detect(city, "china")) %>% mutate(city = city %>% str_remove("\\,.*"))
el_pat_inv <- readRDS("data/CN_el_inventor_patent.rds")
el_pat_tech <- readRDS("data/CN_el_patent_field.rds")
field_names <- read_csv("data/ipc_technology.csv") %>%
select(field_nr, sector, field_name) %>%
distinct(field_nr, .keep_all = TRUE) %>%
mutate(field_nr = field_nr) %>%
arrange(field_nr)
cities_geo <- inventors %>%
distinct(city, .keep_all = TRUE) %>%
select(city, lon, lat)
reg_pat <- el_pat_inv %>%
arrange(appln_id, invt_seq_nr) %>%
distinct(appln_id, .keep_all = TRUE) %>%
left_join(inventors %>% select(person_id, city), by = "person_id") %>%
left_join(patents %>% select(appln_id, appln_filing_year, nb_citing_docdb_fam), by = "appln_id") %>%
select(-invt_seq_nr, person_id)
Let’s not take a look at the complexity of Chinese patenting activity.
We will here use some neath functions from theR package EconGeo. Its only on github, so we have to install it from there.
#devtools::install_github("PABalland/EconGeo", force = T)
library(EconGeo)
First, here we will need to create some big matrices, therefore I will here already define a little helper function which takes an edgelist as input and produces really efficient a sparse M[i,j] 2-mode matrix.
## Helper function
create_sparse_matrix <- function(i.input, j.input){
require(Matrix)
mat <- spMatrix(nrow = i.input %>% n_distinct(),
ncol = j.input %>% n_distinct(),
i = i.input %>% factor() %>% as.numeric(),
j = j.input %>% factor() %>% as.numeric(),
x = rep(1, i.input %>% length() ) )
row.names(mat) <- i.input %>% factor() %>% levels()
colnames(mat) <- j.input %>% factor() %>% levels()
return(mat)
}
Now, we will create this sparse matrix between patents and their associated technology classes.
mat_tech <- create_sparse_matrix(i = el_pat_tech %>% pull(appln_id),
j = el_pat_tech %>% pull(techn_field_nr))
Reminding basic matrix algebra, a 2-mode matrix M[i,j] can be transformed to a 1-mode adjacency matrix M[j,j] by taking the dot-product (matrix multiplication) of its transposed version t(M[i,j]), and into a 1-mode adjacency matrix M[i,i] by taking the dot-product with itself. In summary: M[i,j] %*% M[i,j] = M[j,j] and M[i,j] %*% t(M[i,j]) = M[i,i].
We can do that very efficiently with the crossprod() and tcrossprod() function of the Matrix package.
mat_tech %<>%
crossprod() %>%
as.matrix()
#mat.pat <- mat.tech %>% tcrossprod() %>% as.matrix() # Would create a patent matrix based on shared IPC classes (Not a good idea here)
mat_tech
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 1 9416 909 213 63 209 476 8 411 646 390 15 487 79 43 4
## 2 909 8670 629 572 193 2957 45 555 936 224 4 137 39 4 1
## 3 213 629 7477 4345 289 1083 80 27 174 274 3 134 20 3 0
## 4 63 572 4345 14724 303 2785 271 10 23 354 0 153 12 3 1
## 5 209 193 289 303 1817 418 3 50 15 96 0 90 5 3 0
## 6 476 2957 1083 2785 418 15567 497 255 430 597 20 479 147 19 22
## 7 8 45 80 271 3 497 747 0 2 30 2 85 7 1 0
## 8 411 555 27 10 50 255 0 4826 732 151 5 22 9 90 4
## 9 646 936 174 23 15 430 2 732 4246 195 3 24 48 22 3
## 10 390 224 274 354 96 597 30 151 195 3722 202 227 198 41 82
## 11 15 4 3 0 0 20 2 5 3 202 457 9 22 74 201
## 12 487 137 134 153 90 479 85 22 24 227 9 1824 44 4 2
## 13 79 39 20 12 5 147 7 9 48 198 22 44 1488 24 50
## 14 43 4 3 3 3 19 1 90 22 41 74 4 24 2852 346
## 15 4 1 0 1 0 22 0 4 3 82 201 2 50 346 1420
## 16 4 1 0 0 0 6 0 2 0 15 96 1 77 1283 623
## 17 101 26 0 0 0 6 0 72 62 15 9 2 28 180 31
## 18 5 0 0 0 0 3 0 0 1 3 3 2 7 64 119
## 19 116 28 0 0 1 9 1 143 114 27 22 2 28 390 104
## 20 183 26 10 0 0 12 0 71 31 15 5 4 17 59 6
## 21 220 129 29 1 7 59 1 221 162 40 7 6 29 30 9
## 22 133 39 6 1 1 14 0 156 52 58 18 0 22 17 26
## 23 136 33 4 2 0 12 0 46 34 138 65 20 64 284 65
## 24 23 9 1 0 3 11 0 20 3 56 6 6 49 19 19
## 25 55 65 20 1 1 55 2 36 31 60 4 76 36 3 3
## 26 91 113 15 0 1 33 1 59 64 59 1 19 25 5 3
## 27 145 72 4 1 1 51 5 21 21 77 4 30 58 3 1
## 28 28 23 18 2 4 41 0 19 25 14 3 2 13 16 15
## 29 120 67 22 2 1 27 4 33 76 19 5 9 49 25 31
## 30 96 320 7 1 2 59 0 161 16 28 3 33 22 6 1
## 31 114 222 40 3 3 125 1 22 46 45 5 55 43 2 1
## 32 172 33 15 15 2 63 3 9 16 70 4 84 15 0 0
## 33 94 267 35 26 3 166 4 13 23 32 0 39 54 1 0
## 34 112 122 33 10 16 97 1 34 32 39 0 34 107 5 1
## 35 68 139 56 6 2 122 0 3 6 56 3 46 11 2 0
## 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
## 1 4 101 5 116 183 220 133 136 23 55 91 145 28 120 96 114
## 2 1 26 0 28 26 129 39 33 9 65 113 72 23 67 320 222
## 3 0 0 0 0 10 29 6 4 1 20 15 4 18 22 7 40
## 4 0 0 0 0 0 1 1 2 0 1 0 1 2 2 1 3
## 5 0 0 0 1 0 7 1 0 3 1 1 1 4 1 2 3
## 6 6 6 3 9 12 59 14 12 11 55 33 51 41 27 59 125
## 7 0 0 0 1 0 1 0 0 0 2 1 5 0 4 0 1
## 8 2 72 0 143 71 221 156 46 20 36 59 21 19 33 161 22
## 9 0 62 1 114 31 162 52 34 3 31 64 21 25 76 16 46
## 10 15 15 3 27 15 40 58 138 56 60 59 77 14 19 28 45
## 11 96 9 3 22 5 7 18 65 6 4 1 4 3 5 3 5
## 12 1 2 2 2 4 6 0 20 6 76 19 30 2 9 33 55
## 13 77 28 7 28 17 29 22 64 49 36 25 58 13 49 22 43
## 14 1283 180 64 390 59 30 17 284 19 3 5 3 16 25 6 2
## 15 623 31 119 104 6 9 26 65 19 3 3 1 15 31 1 1
## 16 2290 66 73 202 15 14 18 19 5 2 1 1 3 29 1 2
## 17 66 1316 6 367 60 126 33 123 12 6 4 3 55 243 1 7
## 18 73 6 310 29 3 3 0 20 3 10 2 0 4 40 3 1
## 19 202 367 29 1661 161 158 46 220 48 10 23 13 45 80 27 10
## 20 15 60 3 161 1135 232 111 214 76 9 59 15 43 67 38 15
## 21 14 126 3 158 232 1541 107 112 15 52 106 18 85 222 19 24
## 22 18 33 0 46 111 107 554 43 5 2 8 2 34 34 3 2
## 23 19 123 20 220 214 112 43 1711 251 53 28 45 60 92 53 67
## 24 5 12 3 48 76 15 5 251 653 14 13 31 2 20 32 14
## 25 2 6 10 10 9 52 2 53 14 1150 86 7 32 57 7 113
## 26 1 4 2 23 59 106 8 28 13 86 1368 22 17 63 57 96
## 27 1 3 0 13 15 18 2 45 31 7 22 911 0 14 77 89
## 28 3 55 4 45 43 85 34 60 2 32 17 0 565 59 2 8
## 29 29 243 40 80 67 222 34 92 20 57 63 14 59 1102 14 24
## 30 1 1 3 27 38 19 3 53 32 7 57 77 2 14 1003 46
## 31 2 7 1 10 15 24 2 67 14 113 96 89 8 24 46 1512
## 32 2 17 0 6 6 17 1 14 13 66 28 51 4 31 18 131
## 33 2 9 18 6 4 14 2 56 9 96 43 17 8 44 36 196
## 34 2 15 0 12 17 69 9 41 5 58 28 14 67 41 68 43
## 35 0 14 1 24 15 33 3 59 27 61 37 33 3 44 20 183
## 32 33 34 35
## 1 172 94 112 68
## 2 33 267 122 139
## 3 15 35 33 56
## 4 15 26 10 6
## 5 2 3 16 2
## 6 63 166 97 122
## 7 3 4 1 0
## 8 9 13 34 3
## 9 16 23 32 6
## 10 70 32 39 56
## 11 4 0 0 3
## 12 84 39 34 46
## 13 15 54 107 11
## 14 0 1 5 2
## 15 0 0 1 0
## 16 2 2 2 0
## 17 17 9 15 14
## 18 0 18 0 1
## 19 6 6 12 24
## 20 6 4 17 15
## 21 17 14 69 33
## 22 1 2 9 3
## 23 14 56 41 59
## 24 13 9 5 27
## 25 66 96 58 61
## 26 28 43 28 37
## 27 51 17 14 33
## 28 4 8 67 3
## 29 31 44 41 44
## 30 18 36 68 20
## 31 131 196 43 183
## 32 1122 67 23 54
## 33 67 1561 86 130
## 34 23 86 1048 35
## 35 54 130 35 1330
This matrix can be used for the relatedness() function to transform the co-occurence adjacency matrix to a similarity matrix. This function computes the relatedness (Hidalgo et al., 2007; Boschma et al., 2015; Balland, 2016) between entities (industries, technologies,…) from their co-occurrence (adjacency) matrix. Different normalization procedures are proposed following van Eck and Waltman (2009): association strength, cosine, Jaccard, and an adapted version of the association strength (probability index). I here choose the cosine similarity, since it is not so sensitive to scale.
mat_tech_rel <- mat_tech %>%
relatedness(method = "cosine")
Lets check what happened?
mat_tech_rel[1:10,1:10]
## 1 2 3 4 5 6
## 1 0.000000000 0.12444247 0.031647291 0.008611730 0.065189691 0.05833133
## 2 0.124442471 0.00000000 0.076368437 0.063892840 0.049192202 0.29610968
## 3 0.031647291 0.07636844 0.000000000 0.526742229 0.079944574 0.11770148
## 4 0.008611730 0.06389284 0.526742229 0.000000000 0.077112838 0.27846562
## 5 0.065189691 0.04919220 0.079944574 0.077112838 0.000000000 0.09536861
## 6 0.058331334 0.29610968 0.117701481 0.278465617 0.095368611 0.00000000
## 7 0.003189473 0.01466046 0.028286382 0.088155542 0.002226816 0.14493786
## 8 0.090412297 0.09976656 0.005267541 0.001794887 0.020478111 0.04103197
## 9 0.131096548 0.15521758 0.031316016 0.003808360 0.005667405 0.06382984
## 10 0.080553306 0.03780708 0.050191230 0.059658653 0.036916814 0.09019649
## 7 8 9 10
## 1 0.0031894727 0.090412297 0.1310965481 0.08055331
## 2 0.0146604648 0.099766557 0.1552175800 0.03780708
## 3 0.0282863822 0.005267541 0.0313160155 0.05019123
## 4 0.0881555416 0.001794887 0.0038083600 0.05965865
## 5 0.0022268162 0.020478111 0.0056674045 0.03691681
## 6 0.1449378553 0.041031973 0.0638298363 0.09019649
## 7 0.0000000000 0.000000000 0.0009658712 0.01474587
## 8 0.0000000000 0.000000000 0.1950549954 0.04095274
## 9 0.0009658712 0.195054995 0.0000000000 0.04878808
## 10 0.0147458733 0.040952740 0.0487880757 0.00000000
Now, at the row-column intercept, we will not have the number of co-occurences, but the relatedness index instead.
With this matrix, we can now create the Chinese “Technology Space” a’la Hidalgo & Hausmann (2007). Here we create a network of technology fields. I will delete below average weights to create some sparsity, and calculate the eigenvector centrality. I will here use the Eigenvector centrality as an easy proxi for technological complexity.
library(tidygraph)
g_tech <- mat_tech_rel %>% as_tbl_graph(directed = FALSE) %N>%
left_join(field_names %>% mutate(field_nr = field_nr %>% as.character()), by = c("name" = "field_nr")) %>%
mutate(dgr = centrality_eigen(weights = weight)) %E>%
filter(weight >= mean(weight))
g_tech
## # A tbl_graph: 35 nodes and 177 edges
## #
## # An undirected simple graph with 1 component
## #
## # Edge Data: 177 x 3 (active)
## from to weight
## <int> <int> <dbl>
## 1 1 2 0.124
## 2 1 3 0.0316
## 3 1 5 0.0652
## 4 1 6 0.0583
## 5 1 8 0.0904
## 6 1 9 0.131
## # … with 171 more rows
## #
## # Node Data: 35 x 4
## name sector field_name dgr
## <chr> <chr> <chr> <dbl>
## 1 1 Electrical engineering Electrical machinery, apparatus, energy 0.731
## 2 2 Electrical engineering Audio-visual technology 0.895
## 3 3 Electrical engineering Telecommunications 0.825
## # … with 32 more rows
We will straight away visualize it as a network. We scale the nodes by their centrality, and color them by their higher level tech-sector.
In order to be able to reproduce the same node position in comparative graphs lateron, we will first save the node layout (classical fruchterman-reingold layout).
coords_tech <- g_tech %>% igraph::layout.fruchterman.reingold() %>% as_tibble()
colnames(coords_tech) <- c("x", "y")
library(ggraph)
g_tech %>%
ggraph(layout = coords_tech) +
geom_edge_link(aes(width = weight), alpha = 0.2, colour = "grey") +
geom_node_point(aes(colour = sector, size = dgr)) +
geom_node_text(aes(label = field_name), size = 4, repel = TRUE) +
theme_graph()
Lets see which tech-field is the most central (complex) one:
g_tech %N>%
arrange(desc(dgr)) %>%
as_tibble() %>%
slice(1:10)
And the least one…
g_tech %N>%
arrange(dgr) %>%
as_tibble() %>%
slice(1:10)
Optimally, one would have created this space on world-data, and not only Chinese patents, though. Let’s, now that we know the overall space, take a look at city level specialization and complexity.
We here first join our city level patent data with the technology fields. Since a patent can have multiple ones, we this time fractionalize it with a field_weight.
reg_tech <- reg_pat %>%
left_join(el_pat_tech %>% select(appln_id, techn_field_nr), by = "appln_id") %>%
group_by(appln_id) %>%
mutate(field_weight = 1 / n()) %>%
ungroup()
Now we first aggregate it on the whole time frame. We could for sure also do that sepperate for different years. We filtr our cities with
reg_tech %<>%
group_by(city, techn_field_nr) %>%
summarise(n_tech_reg = sum(field_weight)) %>%
ungroup() %>%
drop_na()
Now, we for the EconGeo package again have to transform it to a matrix. Again, we have to set the rownames, which are depreciated in the tibble
mat_reg_tech <- reg_tech %>%
arrange(techn_field_nr, city) %>%
pivot_wider(names_from = techn_field_nr, values_from = n_tech_reg, values_fill = list(n_tech_reg = 0))
rownames(mat_reg_tech) <- mat_reg_tech %>% pull(city)
mat_reg_tech %<>% select(-city) %>%
as.matrix() %>%
round()
Lets take a look:
mat_reg_tech[1:10, 1:10]
## 1 2 3 4 5 6 7 8 9 10
## baicheng 1 0 0 0 0 0 0 0 0 0
## baoding 1 0 0 0 1 0 0 1 2 3
## baoji 3 0 0 0 0 1 0 0 0 0
## baotou 1 0 0 0 0 1 0 0 0 0
## beijing 802 1452 1202 3114 230 4807 152 1519 1143 666
## cangzhou 0 0 0 0 0 0 0 0 0 0
## changchun 9 0 0 0 0 1 0 10 3 7
## changsha 10 1 1 2 0 9 0 1 2 8
## changshu 5 4 1 0 0 1 0 1 1 0
## changzhou 20 1 0 4 0 5 0 2 1 8
Now, we can use the location.quotient() function from EconGeo. This function computes location quotients from (incidence) regions - industries matrices. The numerator is the share of a given industry in a given region. The denominator is the share of a this industry in a larger economy (overall country for instance). This index is also referred to as the index of Revealed Comparative Advantage (RCA) following Ballasa (1965), or the Hoover-Balassa index. To make it simple, we create a binary one, only indicating if the city has a RCA in the associated field or not. Afterwards, we transform it back to a tibble, set rownames again in a column, and gather it from a long to wide format.
reg_RCA <- mat_reg_tech %>% location.quotient(binary = TRUE) %>%
as.data.frame() %>%
rownames_to_column("city") %>%
as_tibble() %>%
gather(key = "tech_field", value = "RCA", -city) %>%
arrange(city, tech_field)
reg_RCA %>% head()
The Herfindahl() function computes the Herfindahl index from regions - industries matrices from (incidence) regions - industries matrices. It is a measure of concentration. This index is also known as the Herfindahl-Hirschman index (Herfindahl, 1959; Hirschman, 1945).
mat_reg_tech %>%
Herfindahl() %>%
as.data.frame() %>%
rownames_to_column("city") %>%
rename(HH = ".") %>%
arrange(desc(HH)) %>%
head(10)
Next, we can compute the Shannon entropy index (Shannon and Weaver, 1949; Frenken et al., 2007) from regions - industries matrices from (incidence) regions - industries matrices.
mat_reg_tech %>%
entropy() %>%
as.data.frame() %>%
rownames_to_column("city") %>%
rename(SE = ".") %>%
arrange(desc(SE)) %>%
head(10)
Now, finally, we can inspect where in the technology space different cities are specialized. We select some of interest.
city_select <- c("beijing", "shanghai", "shenzhen")
In the following, I plot the technology-space and highlights in red the fields where the city has an RCA.
i = 1
g_tech %N>%
left_join(reg_RCA %>% filter(city == city_select[i]) %>% select(-city), by = c("name" = "tech_field")) %>%
ggraph(layout = coords_tech) +
geom_edge_link(aes(width = weight), alpha = 0.2, colour = "grey") +
geom_node_point(aes(colour = RCA, size = dgr)) +
geom_node_text(aes(label = field_name), size = 5, repel = TRUE) +
scale_color_gradient(low = "skyblue", high = "red") +
theme_graph() +
ggtitle(paste("Technology Space: RCA", city_select[i], sep = " "))
i = 2
g_tech %N>%
left_join(reg_RCA %>% filter(city == city_select[i]) %>% select(-city), by = c("name" = "tech_field")) %>%
ggraph(layout = coords_tech) +
geom_edge_link(aes(width = weight), alpha = 0.2, colour = "grey") +
geom_node_point(aes(colour = RCA, size = dgr)) +
geom_node_text(aes(label = field_name), size = 5, repel = TRUE) +
scale_color_gradient(low = "skyblue", high = "red") +
theme_graph() +
ggtitle(paste("Technology Space: RCA", city_select[i], sep = " "))
i = 3
g_tech %N>%
left_join(reg_RCA %>% filter(city == city_select[i]) %>% select(-city), by = c("name" = "tech_field")) %>%
ggraph(layout = coords_tech) +
geom_edge_link(aes(width = weight), alpha = 0.2, colour = "grey") +
geom_node_point(aes(colour = RCA, size = dgr)) +
geom_node_text(aes(label = field_name), size = 5, repel = TRUE) +
scale_color_gradient(low = "skyblue", high = "red") +
theme_graph() +
ggtitle(paste("Technology Space: RCA", city_select[i], sep = " "))
So, what differences do we see?
You can find more info about:
sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.6 LTS
##
## Matrix products: default
## BLAS: /usr/lib/atlas-base/atlas/libblas.so.3.0
## LAPACK: /usr/lib/atlas-base/atlas/liblapack.so.3.0
##
## locale:
## [1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8
## [4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8
## [7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C
## [10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] Matrix_1.2-17 EconGeo_1.3 ggraph_2.0.2 tidygraph_1.1.2
## [5] magrittr_1.5 forcats_0.5.0 stringr_1.4.0 dplyr_0.8.5
## [9] purrr_0.3.3 readr_1.3.1 tidyr_1.0.2 tibble_2.1.3
## [13] ggplot2_3.3.0 tidyverse_1.3.0 knitr_1.28
##
## loaded via a namespace (and not attached):
## [1] ggrepel_0.8.2 Rcpp_1.0.4 lubridate_1.7.4 lattice_0.20-38
## [5] utf8_1.1.4 assertthat_0.2.1 digest_0.6.25 ggforce_0.3.1
## [9] R6_2.4.1 cellranger_1.1.0 backports_1.1.5 reprex_0.3.0
## [13] evaluate_0.14 httr_1.4.1 pillar_1.4.3 rlang_0.4.5
## [17] readxl_1.3.1 rstudioapi_0.11 rmarkdown_2.1 labeling_0.3
## [21] igraph_1.2.5 polyclip_1.10-0 munsell_0.5.0 broom_0.5.5
## [25] compiler_3.6.0 modelr_0.1.6 xfun_0.12 pkgconfig_2.0.3
## [29] htmltools_0.4.0 tidyselect_1.0.0 gridExtra_2.3 graphlayouts_0.6.0
## [33] viridisLite_0.3.0 fansi_0.4.1 crayon_1.3.4 dbplyr_1.4.2
## [37] withr_2.1.2 MASS_7.3-51.4 grid_3.6.0 nlme_3.1-139
## [41] jsonlite_1.6.1 gtable_0.3.0 lifecycle_0.2.0 DBI_1.1.0
## [45] scales_1.1.0 cli_2.0.2 stringi_1.4.6 farver_2.0.3
## [49] viridis_0.5.1 fs_1.3.2 xml2_1.2.5 ellipsis_0.3.0
## [53] generics_0.0.2 vctrs_0.2.4 tools_3.6.0 glue_1.3.2
## [57] tweenr_1.0.1 hms_0.5.3 yaml_2.2.1 colorspace_1.4-1
## [61] rvest_0.3.5 haven_2.2.0