RではじめるTwitter解析

R Twitter
R 2011 (2011/11/26)
@a_bicky

• Takeshi Arabiki
‣

‣ Twitter & : @a_bicky & id:a_bicky

•
R

•
http://d.hatena.ne.jp/a_bicky/

R
Osaka.R #4 Tokyo.R #16 Tsukuba.R #9

http://www.slideshare.net/abicky/twitterr http://www.slideshare.net/abicky/r-9034336 http://www.slideshare.net/abicky/r-10128090

http://twilog.org/ http://twitraq.userlocal.jp/

http://whotwi.com/
http://tweetstats.com/

http://twilog.org/ http://twitraq.userlocal.jp/

R

http://whotwi.com/
http://tweetstats.com/

Twitter

•
• reshape2
• ggplot2
•

twitteR
twitteR
> library(twitteR) # twitteR
> # (twitteR 0.99.15 )
> Sys.setlocale("LC_TIME", "C")
[1] "C"
> # @a_bicky 3,200 RT
> statuses <- userTimeline("a_bicky", n = 3200)

status
> # R5
> ls.str(statuses[[1]])
created : POSIXct[1:1], format: "2011-11-23 22:16:24"
favorited : logi FALSE ↑ UTC
id : chr "139467359571296256"
initFields : Formal class 'refMethodDef' [package "methods"]
with 5 slots
initialize : Formal class 'refMethodDef' [package "methods"]
with 5 slots
replyToSID : chr(0)
replyToSN : chr(0)
replyToUID : chr(0)
screenName : chr "a_bicky" ! Twitter
statusSource : chr "<a href="http://sites.google.com/site/
yorufukurou/" rel="nofollow">YoruFukurou</a>"
text : chr " "
truncated : logi FALSE ↑

> statusDF <- twListToDF(statuses)
> str(statusDF, vec.len = 1)
'data.frame': 3159 obs. of 10 variables:
$ text : chr "
" ... ↑

$ favorited : logi FALSE ...
$ replyToSN : logi NA ...
$ created : POSIXct, format: "2011-11-23 22:16:24" ...
$ truncated : logi FALSE ... ↑ UTC
$ replyToSID : logi NA ...
$ id : chr "139467359571296256" ...
$ replyToUID : logi NA ...
$ statusSource: chr "<a href="http://sites.google.com/
site/yorufukurou/" rel="nofollow">YoruFukurou</a>" ...
$ screenName : chr "a_bicky" ...

> wday.abb <- c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")
> statusDF <- within(statusDF, {
+ attr(created, "tzone") <- "Asia/Tokyo" # JST
+ statusSource <- factor(gsub("<a .*?>(.*?)</a>", "\\1",
statusSource)) # HTML
+ date <- factor(format(created, "%Y-%m-%d")) #
+ hour <- NULL; month <- NULL; year <- NULL; wday <- NULL
+ with(as.POSIXlt(created), {
+ hour <<- factor(hour) #
+ month <<- factor(mon + 1) #
+ year <<- factor(year + 1900) #
+ wday <<- factor((wday + 6) %% 7, labels = wday.abb) #
+ })
+ textLength <- nchar(text) #
+ # , URL,
+ cleanText <- removeSpecialStr(text)
+ cleanTextLength <- nchar(cleanText) # URL
+ })

> # Twitter
> topSources <- names(head(sort(table(statusDF$statusSource),
decreasing = TRUE), 5))
> statusDF <- within(statusDF, {
+ statusSource <- as.character(statusSource)
+ statusSource[!statusSource %in% topSources] <- "other"
+ #
+ statusSource <- factor(statusSource, levels = names(sort(table
(statusSource), dec = TRUE)))
+ })

Excel

9 11 ”Twitter for iPhone”, ”YoruFukurou”
Sat Mon 12 23

reshape2
> library(reshape2)
> acast(melt(statusDF, id.vars = c("statusSource", "wday", "month", "hour"),
+ measure.vars = c("textLength")),
+ month + statusSource ~ wday, mean,
+ subset = .(statusSource %in% c("YoruFukurou", "Twitter for iPhone")
+ & month %in% 9:11 & hour %in% 12:23
+ & wday %in% c("Mon", "Sat", "Sun")))
Mon Sat Sun
9_YoruFukurou 43 42.13333 54.76471
9_Twitter for iPhone 16 27.70000 20.50000
10_YoruFukurou 61 41.70175 56.98333
10_Twitter for iPhone NaN 27.00000 24.50000
11_YoruFukurou 35 41.08197 57.32609
11_Twitter for iPhone NaN NaN 32.00000

reshape2
> library(reshape2)
> acast(melt(statusDF, id.vars = c("statusSource", "wday", "month", "hour"),
+ measure.vars = c("textLength")),
+ month + statusSource ~ wday, mean,
+ subset = .(statusSource %in% c("YoruFukurou", "Twitter for iPhone")
+ & month %in% 9:11 & hour %in% 12:23
+ & wday %in% c("Mon", "Sat", "Sun")))
Mon Sat Sun
9_YoruFukurou 43 42.13333 54.76471
9_Twitter for iPhone 16 27.70000 20.50000
10_YoruFukurou 61 41.70175 56.98333
10_Twitter for iPhone NaN 27.00000 24.50000
11_YoruFukurou 35 41.08197 57.32609
11_Twitter for iPhone NaN NaN 32.00000

R

reshape2 melt
melt cast
melt
cast
> mstatus <- melt(statusDF,
+ id.vars = c("statusSource", "wday", "year", "month", "hour", "date"),
+ measure.vars = c("textLength", "cleanTextLength"))
> mstatus[3157:3162, ]
statusSource wday year month hour date variable value
3157 web Sun 2011 3 20 2011-03-13 textLength 72
3160 YoruFukurou Wed 2011 11 1 2011-11-23 cleanTextLength 87

id

reshape2 cast
cast
formula fun.aggregate
> args(acast) # array acast
function (data, formula, fun.aggregate = NULL, ..., margins = NULL,
subset = NULL, fill = NULL, drop = TRUE, value_var = guess_value(data))
NULL
> args(dcast) # data.frame dcast
function (data, formula, fun.aggregate = NULL, ..., margins = NULL,
subset = NULL, fill = NULL, drop = TRUE, value_var = guess_value(data))
NULL

formula
...
.
acast hoge ~ fuga ~ piyo
※dcast 1 hoge ~ fuga + piyo

> #
> acast(mstatus, . ~ wday, length, subset = .(variable == "textLength"))
↑ cleanTextLength

> #
Mon Tue Wed Thu Fri Sat Sun
[1,] 408 360 258 294 334 801 704
>

> #
[1,] 408 360 258 294 334 801 704
> #
> acast(mstatus, hour ~ wday, length, subset = .(variable ==
"textLength"))

> #
[1,] 408 360 258 294 334 801 704
> #
> acast(mstatus, hour ~ wday, length, subset = .(variable ==
"textLength"))
0 65 69 26 46 46 49 40
1 48 19 11 15 27 44 37
2 31 24 6 16 17 23 17
3 27 19 4 11 14 17 10
4 4 15 1 7 4 5 7
5 5 11 1 4 3 4 5
6 4 14 3 6 9 8 1

> #
> #
> acast(mstatus, hour ~ wday + month, length, subset = .(variable ==
"textLength"))

> #
> #
"textLength"))
Mon_3 Mon_4 Mon_5 Mon_6 Mon_7 Mon_8 Mon_9 Mon_10 Mon_11 Tue_3 Tue_4
0 3 4 13 3 1 10 7 15 9 4 2
1 0 0 1 0 1 9 16 12 9 1 0
2 2 0 0 0 2 7 6 7 7 2 0

> #
> #
"textLength"))
0 3 4 13 3 1 10 7 15 9 4 2
1 0 0 1 0 1 9 16 12 9 1 0
2 2 0 0 0 2 7 6 7 7 2 0

> # 3
> acast(mstatus, hour ~ wday ~ month, length, subset = .(variable ==
"textLength"))

> #
> #
"textLength"))
0 3 4 13 3 1 10 7 15 9 4 2
1 0 0 1 0 1 9 16 12 9 1 0
2 2 0 0 0 2 7 6 7 7 2 0

> # 3
"textLength"))
, , 3

0 3 4 1 0 1 6 4
1 0 1 3 0 0 0 1

> #
> #
"textLength"))
0 3 4 13 3 1 10 7 15 9 4 2
1 0 0 1 0 1 9 16 12 9 1 0
2 2 0 0 0 2 7 6 7 7 2 0

> # 3
"textLength"))
, , 3 3

0 3 4 1 0 1 6 4
1 0 1 3 0 0 0 1

Twitter
reshape2 1

> #
> dcast(mstatus, statusSource ~ .,
+ function(x) list(c(mean = mean(x), sd = sd(x))),
+ fill = list(c(mean = NaN, sd = NA)), ←
+ subset = .(variable == "textLength"))

Twitter
reshape2 1

> #
> dcast(mstatus, statusSource ~ .,
+ function(x) list(c(mean = mean(x), sd = sd(x))),
+ fill = list(c(mean = NaN, sd = NA)), ←
+ subset = .(variable == "textLength"))
statusSource NA
1 YoruFukurou 47.51462, 32.57973
2 web 57.02720, 36.33534
3 Twitter for iPhone 33.42342, 23.06466
4 Twitter for Android 28.49048, 20.08457
5 Hatena 80.00000, 25.94212
6 other 52.58621, 33.12180
>

> # t
> pc <- unlist(subset(statusDF,
+ statusSource %in% c("YoruFukurou", "web"),
+ textLength))
> sp <- unlist(subset(statusDF,
+ grepl("(iPhone|Android)", statusSource),
+ textLength))
> t.test(sp, pc, var.equal = FALSE)

Welch Two Sample t-test
!!
data: sp and pc
t = -15.7921, df = 1588.246, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-19.85334 -15.46645
sample estimates:
mean of x mean of y
31.83945 49.49935

> extractScreenNames <- function(text, strict = TRUE) {
+ if (strict) {
+ # Twitter screen_name
+ regex <- "(?:(?<!\\w)([@＠])((?>\\w+))(?![@＠])|[\\s\\S])"
+ } else {
+ # hoge@example.com
+ regex <- "(?:([@＠])(\\w+)|[\\s\\S])"
+ }
+ screenNames <- gsub(regex, "\\1\\2", text, perl = TRUE)
+ unique(unlist(strsplit(substring(screenNames, 2), "[@＠]")))
+ }
> screenNames <- unlist(lapply(statusDF$text, extractScreenNames))
> head(sort(table(screenNames), decreasing = TRUE), 10) # Top 10
screenNames
naopr __gfx__ hirota_inoue mandy_44 ask_a_lie
105 85 51 47 40
ken_nishi nokuno yokkuns JinJin0613 kanon19_rie
39 39 33 20 20

ggplot2
plot(statusDF$wday, col = "blue")
ggplot2

qplot(wday, data = statusDF, fill = I("blue"),
alpha = I(0.7), xlab = "", ylab = "")

ggplot2

qplot(wday, data = statusDF, fill = statusSource,
xlab = "", ylab = "")

ggplot2
qplot(wday, data = statusDF, facets = ~ statusSource,
fill = I("blue"), alpha = I(0.7), xlab = "", ylab = "")

xlab = "", ylab = "")

qplot
ggplot2
> args(qplot)
function (x, y = NULL, z = NULL, ..., data, facets = . ~ ., margins =
FALSE,
geom = "auto", stat = list(NULL), position = list(NULL),
xlim = c(NA, NA), ylim = c(NA, NA), log = "", main = NULL,
xlab = deparse(substitute(x)), ylab = deparse(substitute(y)),
asp = NA)
NULL

qplot geom
geom

area:
bar:
histogram:
line:
point:

qplot geom
geom

area:
bar:
histogram:
line:
point:

qplot(as.integer(wday), data = statusDF, geom = "area", stat = "bin",
fill = statusSource, xlab = "", ylab = "", binwidth = 1)

qplot geom
geom

area:
bar:
histogram:
line:
point:

qplot(wday, data = statusDF, geom = "bar", stat = "bin",
fill = statusSource, xlab = "", ylab = "")

qplot geom
geom

area:
bar:
histogram:
line:
point:

qplot(as.integer(wday), data = statusDF, geom = "line", stat = "bin",
colour = statusSource, xlab = "", ylab = "", binwidth = 1)

qplot geom
geom

area:
bar:
histogram:
line:
point:

qplot(wday, data = statusDF, geom = "point", stat = "bin",
colour = statusSource, xlab = "", ylab = "")

qplot position
position geom

dodge :
fill : 1
jitter :
stack :

qplot position
position geom

dodge :
fill : 1
jitter :
stack :

position = "dodge", xlab = "", ylab = "")

qplot position
position geom

dodge :
fill : 1
jitter :
stack :

position = "fill", xlab = "", ylab = "")

qplot position
position geom

dodge :
fill : 1
jitter :
stack :

position = "jitter", xlab = "", ylab = "")

qplot position
position geom

dodge :
fill : 1
jitter :
stack :

position = "stack", xlab = "", ylab = "")

qplot facets
facets geom
~ :
1 ~ 2: 1, 2
※reshape2 1 ~ 2 + 3

qplot facets
facets geom
~ :
1 ~ 2: 1, 2
※reshape2 1 ~ 2 + 3

qplot(wday, data = statusDF, xlab = "", ylab = "",
facets = ~ statusSource)

qplot facets
facets geom
~ :
1 ~ 2: 1, 2
※reshape2 1 ~ 2 + 3

facets = month ~ statusSource)

qplot
alpha :
colour (color) :
fill :
linetype :
size :

colour, fill, linetype statusSource
fill = I("blue") I (AsIs)

qplot
alpha :
colour (color) :
fill :
linetype :
size :

alpha = as.integer(wday))

qplot
alpha :
colour (color) :
fill :
linetype :
size :

colour = statusSource)

qplot
alpha :
colour (color) :
fill :
linetype :
size :

fill = statusSource)

qplot
alpha :
colour (color) :
fill :
linetype :
size :

linetype = statusSource, colour = statusSource)

whotwi

http://whotwi.com/

whotwi
> # Twitter
> # melt cast xtabs
> cnt <- as.data.frame(xtabs(~ hour + wday + statusSource, statusDF))
> head(cnt, 3)
hour wday statusSource Freq
1 0 Mon YoruFukurou 48

whotwi
> # Twitter
> # melt cast xtabs
> cnt <- as.data.frame(xtabs(~ hour + wday + statusSource, statusDF))
> head(cnt, 3)
hour wday statusSource Freq
> freqSources <- by(cnt, cnt[c("hour", "wday")], function(df) {
+ #
+ freqSource <- with(df, statusSource[order(Freq, decreasing = TRUE)
[1]])
+ cbind(df[1, c("hour", "wday")], freqSource)
+ })
> freqSources <- do.call(rbind, freqSources)
> head(freqSources, 3)
hour wday freqSource
1 0 Mon YoruFukurou
2 1 Mon YoruFukurou
3 2 Mon YoruFukurou

whotwi
> #
> cntSum <- as.data.frame(xtabs(Freq ~ hour + wday, cnt))
> head(cntSum, 3)
hour wday Freq
1 0 Mon 65
2 1 Mon 48
3 2 Mon 31

whotwi
> #
> head(cntSum, 3)
hour wday Freq
1 0 Mon 65
2 1 Mon 48
3 2 Mon 31
> #
> data <- merge(cntSum, freqSources)
> #
> data$wday <- factor(data$wday, levels = rev(levels(data$wday)))
> #
> data$Freq <- log2(data$Freq)

whotwi
> #
> head(cntSum, 3)
hour wday Freq
1 0 Mon 65
2 1 Mon 48
3 2 Mon 31
> #
> data <- merge(cntSum, freqSources)
> #
> data$wday <- factor(data$wday, levels = rev(levels(data$wday)))
> #
> data$Freq <- log2(data$Freq)
> p <- qplot(hour, wday, data = data, xlab = "", ylab = "",
+ geom = "point", colour = freqSource, size = Freq)
> p # print(p)

whotwi
> # whotwi theme
> theme_whotwi <- function() {
+ opts( #
+ panel.background = theme_rect(fill = NA, colour = NA),
+ #
+ legend.key = theme_rect(fill = NA, colour = NA),
+ #
+ axis.ticks = theme_segment(colour = NA))
+ }
> p2 <- p + theme_whotwi() + scale_size(legend = FALSE) +
scale_colour_hue(name = "")
> p2

RMeCab
MeCab R

> library(RMeCab)
> (docDF(data.frame(" "), column = 1, type = 1))
number of extracted terms = 5
now making a data frame. wait a while!

TERM POS1 POS2 Row1
1 1
2 1
3 1
4 2
5 2

http://www.lr.pi.titech.ac.jp/~takamura/pndic_ja.html
: : :1
: : :0.999995
: : :0.999979
: : :0.999979
: : :0.999645
: : :0.999486
: : :0.999314
...

> #
> pndic <- read.table("http://www.lr.pi.titech.ac.jp/~takamura/pubs/pn_ja.dic",
+ sep = ":",
+ col.names = c("term", "kana", "pos", "value"),
+ colClasses = c("character", "character", "factor",
"numeric"),
+ fileEncoding = "Shift_JIS")
> #
> #
> pndic2 <- aggregate(value ~ term + pos, pndic, mean)

> # pndic
> pos <- unique(pndic2$pos)
> tweetDF <- docDF(statusDF, column = "cleanText", type = 1, pos = pos)
number of extracted terms = 7164
now making a data frame. wait a while!

> tweetDF[2900:2904, 1:5]
TERM POS1 POS2 Row1 Row2
2900 0 0
2901 0 0
2902 0 0
2903 0 0
2904 0 0
> # pndic
> tweetDF <- subset(tweetDF, TERM %in% pndic2$term)
> #
> tweetDF <- merge(tweetDF, pndic2, by.x = c("TERM", "POS1"), by.y = c
("term", "pos"))

> #
> score <- colSums(tweetDF[4:(ncol(tweetDF) - 1)] * tweetDF$value)
> #
> sum(score > 0)
[1] 117

> #
> #
> sum(score > 0)
[1] 117
> #
> sum(score < 0)
[1] 2765

> #
> #
> sum(score > 0)
[1] 117
> #
> sum(score < 0)
[1] 2765
> #
> sum(score == 0)
[1] 277

> table(ifelse(pndic$value > 0, "positive",
+ ifelse(pndic$value == 0, "neutral", "negative")))

negative neutral positive
49983 20 5122

> m <- mean(score)
> #
> tweetType <- factor(ifelse(score > m, "positive",
+ ifelse(score == m, "neutral", "negative")),
+ levels = c("positive", "neutral", "negative"))
> table(tweetType)
tweetType
positive neutral negative
1912 0 1247

> statusDF$tweetType <- droplevels(tweetType)
> #
> qplot(month, data = statusDF,
+ geom = "bar", fill = tweetType, position = "fill")

twitteR
• RJSONIO
•
• ID status ID
• fav favorited TRUE
• truncated TRUE
• DM
• status
character factor

OAuth ” ” twitteR -

• twitteR
• reshape2 R

• ggplot2

• RMeCab R

• twitteR
• reshape2 R

• ggplot2

• RMeCab R

• PC
•

https://github.com/abicky/rjpusers2011_abicky

status
> statuses[[1]]$text
[1] " "
> statuses[[1]]$getText() #
[1] " "
> #
> statuses[[1]]$text <- " "
> statuses[[1]]$getText()
[1] " "
> statuses[[1]]$setText("ggrks") #
> statuses[[1]]$getText()
[1] "ggrks"
> #
> statuses[[1]]$getCreated()
[1] "2011-11-23 22:16:24 UTC"

removeSpecialStr

# Strip Twitter-specific tokens from `text`.
#
# Applies removeScreenName(), then removeHashTag(), then removeURL(), each
# with its default arguments, and returns the result of the last call.
removeSpecialStr <- function(text) {
  withoutNames <- removeScreenName(text)
  withoutTags <- removeHashTag(withoutNames)
  removeURL(withoutTags)
}

removeScreenName

# Remove Twitter screen names (e.g. "@a_bicky") from `text`.
#
# Args:
#   text:   character vector to clean.
#   strict: if TRUE (default), a mention is removed only when the at sign is
#           not preceded by a word character and the name is not directly
#           followed by another at sign, so "hoge@example.com" is left
#           untouched. If FALSE, every at sign followed by word characters
#           is removed.
#
# Returns:
#   `text` with the matched mentions deleted.
removeScreenName <- function(text, strict = TRUE) {
  # \uFF20 is the full-width at sign; it starts a mention just like "@".
  if (strict) {
    # The atomic group (?>\\w+) stops the engine from shortening the name to
    # satisfy the trailing look-ahead.
    regex <- "(?<!\\w)[@\uFF20](?>\\w+)(?![@\uFF20])"
  } else {
    regex <- "[@\uFF20]\\w+"
  }
  gsub(regex, "", text, perl = TRUE)
}

removeURL

# Remove http/https URLs from `text`.
#
# Args:
#   text:   character vector to clean.
#   strict: if TRUE (default), a URL is removed only when it is not directly
#           preceded by one of - . # @ = ! ' " / or a word character, and
#           when its host is a sequence of dot-separated labels ending in
#           an alphabetic label. If FALSE, "http(s)://" followed by any run
#           of URL-ish characters is removed.
#
# Returns:
#   `text` with the matched URLs deleted.
removeURL <- function(text, strict = TRUE) {
  if (strict) {
    # NOTE(review): the optional "user:password@" part is matched loosely
    # ([^:]+:.+@) and can therefore span whitespace; confirm this is intended.
    regex <- paste0(
      "(?<![-.\\w#@=!'\"/])https?://",                # scheme, not embedded
      "(?:[^:]+:.+@)?",                               # optional credentials
      "(?:[0-9A-Za-z][-0-9A-Za-z]*(?<!-)\\.)+",       # host labels
      "[A-Za-z]+",                                    # final label, letters only
      "(?:/[-\\w#%=+,.?!&~]*)*"                       # path / query / fragment
    )
  } else {
    regex <- "https?://[-\\w#%=+,.?!&~/]+"
  }
  gsub(regex, "", text, perl = TRUE)
}

removeHashTag

# Remove Twitter hashtags from `text`.
#
# Args:
#   text:   character vector to clean.
#   strict: if TRUE (default), a hashtag is removed only when it starts the
#           string or follows a delimiter (whitespace, ",", ".", or the
#           full-width characters U+3000-U+3002, U+FF01, U+FF1F). The
#           delimiter is kept. A tag made of digits only (e.g. "#123") is
#           also kept. If FALSE, every number sign followed by
#           non-delimiter characters is removed.
#
# Returns:
#   `text` with the matched hashtags deleted.
removeHashTag <- function(text, strict = TRUE) {
  delimiters <- "\\s,.\u3000-\u3002\uFF01\uFF1F"
  # Character ranges accepted inside a hashtag in addition to \w.
  # cf. http://nobu666.com/2011/07/13/914.html
  validJa <- paste0(
    "\u3041-\u3094\u3099-\u309C\u30A1-\u30FA\u30FC\u3400-\uD7A3",
    "\uFF10-\uFF19\uFF21-\uFF3A\uFF41-\uFF5A\uFF66-\uFF9E"
  )
  # \uFF03 is the full-width number sign; it starts a hashtag just like "#".
  if (strict) {
    # Group 1 = leading delimiter, group 2 = digits-only tag; both are put
    # back by the replacement, so only real hashtags disappear.
    regex <- sprintf(
      "(^|[%s])(?:([#\uFF03](?>[0-9]+)(?!\\w))|[#\uFF03][\\w%s]+)",
      delimiters, validJa
    )
    replacement <- "\\1\\2"
  } else {
    # This pattern has no capture groups, so the match is simply dropped.
    regex <- sprintf("[#\uFF03][^%s]+", delimiters)
    replacement <- ""
  }
  gsub(regex, replacement, text, perl = TRUE)
}

RではじめるTwitter解析

Recommended

Recommended

More Related Content

What's hot

What's hot (20)

Viewers also liked

Viewers also liked (16)

Similar to RではじめるTwitter解析

Similar to RではじめるTwitter解析 (20)

More from Takeshi Arabiki

More from Takeshi Arabiki (15)

Recently uploaded

Recently uploaded (20)

RではじめるTwitter解析