SlideShare a Scribd company logo
1 of 131
Download to read offline
R       Twitter
    R      2011 (2011/11/26)
                  @a_bicky
• Takeshi Arabiki
    ‣

    ‣ Twitter &          : @a_bicky & id:a_bicky

•
                                R

•
                  http://d.hatena.ne.jp/a_bicky/
R
           Osaka.R #4                               Tokyo.R #16                               Tsukuba.R #9




http://www.slideshare.net/abicky/twitterr   http://www.slideshare.net/abicky/r-9034336 http://www.slideshare.net/abicky/r-10128090
※
Twitter
Mentionmapp
Mentionmapp
Mentionmapp
http://twilog.org/   http://twitraq.userlocal.jp/




http://whotwi.com/
                                 http://tweetstats.com/
http://twilog.org/   http://twitraq.userlocal.jp/




    R

http://whotwi.com/
                                 http://tweetstats.com/
Twitter


•
•             reshape2
•               ggplot2
•
Twitter


•
•             reshape2
•               ggplot2
•
twitteR
      twitteR
> library(twitteR) # twitteR
> #                          (twitteR 0.99.15     )
> Sys.setlocale("LC_TIME", "C")
[1] "C"
> # @a_bicky         3,200          RT
> statuses <- userTimeline("a_bicky", n = 3200)
status
> #             R5
> ls.str(statuses[[1]])
created : POSIXct[1:1], format: "2011-11-23 22:16:24"
favorited : logi FALSE           ↑            UTC
id : chr "139467359571296256"
initFields : Formal class 'refMethodDef' [package "methods"]
with 5 slots
initialize : Formal class 'refMethodDef' [package "methods"]
with 5 slots
replyToSID : chr(0)
replyToSN : chr(0)
replyToUID : chr(0)
screenName : chr "a_bicky"     ! Twitter
statusSource : chr "<a href="http://sites.google.com/site/
yorufukurou/" rel="nofollow">YoruFukurou</a>"
text : chr "                                               "
truncated :   logi FALSE             ↑
> statusDF <- twListToDF(statuses)
> str(statusDF, vec.len = 1)
'data.frame':	 3159 obs. of 10 variables:
 $ text        : chr "
         " ...                     ↑

 $ favorited   : logi FALSE ...
 $ replyToSN   : logi NA ...
 $ created     : POSIXct, format: "2011-11-23 22:16:24" ...
 $ truncated   : logi FALSE ...      ↑           UTC
 $ replyToSID : logi NA ...
 $ id          : chr "139467359571296256" ...
 $ replyToUID : logi NA ...
 $ statusSource: chr "<a href="http://sites.google.com/
site/yorufukurou/" rel="nofollow">YoruFukurou</a>" ...
 $ screenName : chr "a_bicky" ...
> wday.abb <- c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")
> statusDF <- within(statusDF, {
+     attr(created, "tzone") <- "Asia/Tokyo" # JST
+     statusSource <- factor(gsub("<a .*?>(.*?)</a>", "\\1",
statusSource)) # HTML
+     date <- factor(format(created, "%Y-%m-%d")) #
+     hour <- NULL; month <- NULL; year <- NULL; wday <- NULL
+     with(as.POSIXlt(created), {
+         hour <<- factor(hour)         #
+         month <<- factor(mon + 1)     #
+         year <<- factor(year + 1900) #
+         wday <<- factor((wday + 6) %% 7, labels = wday.abb) #
+     })
+     textLength <- nchar(text) #
+     #        , URL,
+     cleanText <- removeSpecialStr(text)
+     cleanTextLength <- nchar(cleanText) # URL
+ })
> #                  Twitter
> topSources <- names(head(sort(table(statusDF$statusSource),
decreasing = TRUE), 5))
> statusDF <- within(statusDF, {
+     statusSource <- as.character(statusSource)
+     statusSource[!statusSource %in% topSources] <- "other"
+     #
+     statusSource <- factor(statusSource, levels = names(sort(table
(statusSource), dec = TRUE)))
+ })
Twitter


•
•             reshape2
•               ggplot2
•
reshape2
Excel




9   11   ”Twitter for iPhone”, ”YoruFukurou”
    Sat Mon 12         23
reshape2
> library(reshape2)
> acast(melt(statusDF, id.vars = c("statusSource", "wday", "month", "hour"),
+             measure.vars = c("textLength")),
+       month + statusSource ~ wday, mean,
+       subset = .(statusSource %in% c("YoruFukurou", "Twitter for iPhone")
+         & month %in% 9:11 & hour %in% 12:23
+         & wday %in% c("Mon", "Sat", "Sun")))
                       Mon      Sat      Sun
9_YoruFukurou           43 42.13333 54.76471
9_Twitter for iPhone    16 27.70000 20.50000
10_YoruFukurou          61 41.70175 56.98333
10_Twitter for iPhone NaN 27.00000 24.50000
11_YoruFukurou          35 41.08197 57.32609
11_Twitter for iPhone NaN       NaN 32.00000
reshape2
> library(reshape2)
> acast(melt(statusDF, id.vars = c("statusSource", "wday", "month", "hour"),
+             measure.vars = c("textLength")),
+       month + statusSource ~ wday, mean,
+       subset = .(statusSource %in% c("YoruFukurou", "Twitter for iPhone")
+         & month %in% 9:11 & hour %in% 12:23
+         & wday %in% c("Mon", "Sat", "Sun")))
                       Mon      Sat      Sun
9_YoruFukurou           43 42.13333 54.76471
9_Twitter for iPhone    16 27.70000 20.50000
10_YoruFukurou          61 41.70175 56.98333
10_Twitter for iPhone NaN 27.00000 24.50000
11_YoruFukurou          35 41.08197 57.32609
11_Twitter for iPhone NaN       NaN 32.00000
reshape2
> library(reshape2)
> acast(melt(statusDF, id.vars = c("statusSource", "wday", "month", "hour"),
+             measure.vars = c("textLength")),
+       month + statusSource ~ wday, mean,
+       subset = .(statusSource %in% c("YoruFukurou", "Twitter for iPhone")
+         & month %in% 9:11 & hour %in% 12:23
+         & wday %in% c("Mon", "Sat", "Sun")))
                       Mon      Sat      Sun
9_YoruFukurou           43 42.13333 54.76471
9_Twitter for iPhone    16 27.70000 20.50000
10_YoruFukurou          61 41.70175 56.98333
10_Twitter for iPhone NaN 27.00000 24.50000
11_YoruFukurou          35 41.08197 57.32609
11_Twitter for iPhone NaN       NaN 32.00000




         R
reshape2                                 melt
  melt                                    cast
   melt
cast
> mstatus <- melt(statusDF,
+    id.vars = c("statusSource", "wday", "year", "month", "hour", "date"),
+    measure.vars = c("textLength", "cleanTextLength"))
> mstatus[3157:3162, ]
      statusSource wday year month hour       date        variable value
3157           web Sun 2011      3   20 2011-03-13      textLength    72
3158           web Sun 2011      3   16 2011-03-13      textLength    24
3159           web Sun 2011      3   14 2011-03-13      textLength    82
3160 YoruFukurou Wed 2011       11    1 2011-11-23 cleanTextLength    87
3161 YoruFukurou Wed 2011       11    1 2011-11-23 cleanTextLength    14
3162 YoruFukurou Wed 2011       11    1 2011-11-23 cleanTextLength    21



              id
reshape2                                    cast
      cast
formula                                     fun.aggregate
> args(acast) #         array                       acast
function (data, formula, fun.aggregate    = NULL, ..., margins = NULL,
     subset = NULL, fill = NULL, drop =   TRUE, value_var = guess_value(data))
NULL
> args(dcast) #         data.frame                          dcast
function (data, formula, fun.aggregate    = NULL, ..., margins = NULL,
     subset = NULL, fill = NULL, drop =   TRUE, value_var = guess_value(data))
NULL


formula
...
.
acast     hoge ~ fuga ~ piyo
※dcast       1                            hoge ~ fuga + piyo
> #
> acast(mstatus, . ~ wday, length, subset = .(variable == "textLength"))
                                    ↑            cleanTextLength
> #
> acast(mstatus, . ~ wday, length, subset = .(variable == "textLength"))
     Mon Tue Wed Thu Fri Sat Sun
[1,] 408 360 258 294 334 801 704
>
> #
> acast(mstatus, . ~ wday, length, subset = .(variable == "textLength"))
     Mon Tue Wed Thu Fri Sat Sun
[1,] 408 360 258 294 334 801 704
>
> #
> acast(mstatus,   . ~ wday, length, subset = .(variable == "textLength"))
     Mon Tue Wed   Thu Fri Sat Sun
[1,] 408 360 258   294 334 801 704
> #
> acast(mstatus,   hour ~ wday, length, subset = .(variable ==
"textLength"))
> #
> acast(mstatus, . ~ wday, length, subset = .(variable == "textLength"))
     Mon Tue Wed Thu Fri Sat Sun
[1,] 408 360 258 294 334 801 704
> #
> acast(mstatus, hour ~ wday, length, subset = .(variable ==
"textLength"))
   Mon Tue Wed Thu Fri Sat Sun
0   65 69 26 46 46 49 40
1   48 19 11 15 27 44 37
2   31 24    6 16 17 23 17
3   27 19    4 11 14 17 10
4    4 15    1   7   4   5   7
5    5 11    1   4   3   4   5
6    4 14    3   6   9   8   1
> #
> #
> acast(mstatus, hour ~ wday + month, length, subset = .(variable ==
"textLength"))
> #
> #
> acast(mstatus, hour ~ wday + month, length, subset = .(variable ==
"textLength"))
   Mon_3 Mon_4 Mon_5 Mon_6 Mon_7 Mon_8 Mon_9 Mon_10 Mon_11 Tue_3 Tue_4
0      3     4    13     3     1    10     7     15      9     4     2
1      0     0     1     0     1     9    16     12      9     1     0
2      2     0     0     0     2     7     6      7      7     2     0
> #
> #
> acast(mstatus, hour ~ wday + month, length, subset = .(variable ==
"textLength"))
   Mon_3 Mon_4 Mon_5 Mon_6 Mon_7 Mon_8 Mon_9 Mon_10 Mon_11 Tue_3 Tue_4
0      3     4    13     3     1    10     7     15      9     4     2
1      0     0     1     0     1     9    16     12      9     1     0
2      2     0     0     0     2     7     6      7      7     2     0


> # 3
> acast(mstatus, hour ~ wday ~ month, length, subset = .(variable ==
"textLength"))
> #
> #
> acast(mstatus, hour ~ wday + month, length, subset = .(variable ==
"textLength"))
   Mon_3 Mon_4 Mon_5 Mon_6 Mon_7 Mon_8 Mon_9 Mon_10 Mon_11 Tue_3 Tue_4
0      3     4    13     3     1    10     7     15      9     4     2
1      0     0     1     0     1     9    16     12      9     1     0
2      2     0     0     0     2     7     6      7      7     2     0


> # 3
> acast(mstatus, hour ~ wday ~ month, length, subset = .(variable ==
"textLength"))
, , 3

    Mon Tue Wed Thu Fri Sat Sun
0     3   4   1   0   1   6   4
1     0   1   3   0   0   0   1
> #
> #
> acast(mstatus, hour ~ wday + month, length, subset = .(variable ==
"textLength"))
   Mon_3 Mon_4 Mon_5 Mon_6 Mon_7 Mon_8 Mon_9 Mon_10 Mon_11 Tue_3 Tue_4
0      3     4    13     3     1    10     7     15      9     4     2
1      0     0     1     0     1     9    16     12      9     1     0
2      2     0     0     0     2     7     6      7      7     2     0


> # 3
> acast(mstatus, hour ~ wday ~ month, length, subset = .(variable ==
"textLength"))
, , 3          3

    Mon Tue Wed Thu Fri Sat Sun
0     3   4   1   0   1   6   4
1     0   1   3   0   0   0   1
Twitter
                                               reshape2    1


> #
> dcast(mstatus, statusSource ~ .,
+       function(x) list(c(mean = mean(x), sd = sd(x))),
+       fill = list(c(mean = NaN, sd = NA)), ←
+       subset = .(variable == "textLength"))
Twitter
                                               reshape2    1


> #
> dcast(mstatus, statusSource ~ .,
+       function(x) list(c(mean = mean(x), sd = sd(x))),
+       fill = list(c(mean = NaN, sd = NA)), ←
+       subset = .(variable == "textLength"))
         statusSource                 NA
1         YoruFukurou 47.51462, 32.57973
2                 web 57.02720, 36.33534
3 Twitter for iPhone 33.42342, 23.06466
4 Twitter for Android 28.49048, 20.08457
5              Hatena 80.00000, 25.94212
6               other 52.58621, 33.12180
>
Twitter
                                               reshape2    1


> #
> dcast(mstatus, statusSource ~ .,
+       function(x) list(c(mean = mean(x), sd = sd(x))),
+       fill = list(c(mean = NaN, sd = NA)), ←
+       subset = .(variable == "textLength"))
         statusSource                 NA
1         YoruFukurou 47.51462, 32.57973
2                 web 57.02720, 36.33534
3 Twitter for iPhone 33.42342, 23.06466
4 Twitter for Android 28.49048, 20.08457
5              Hatena 80.00000, 25.94212
6               other 52.58621, 33.12180
>
>   #                     t
>   pc <- unlist(subset(statusDF,
+                       statusSource %in% c("YoruFukurou", "web"),
+                       textLength))
>   sp <- unlist(subset(statusDF,
+                       grepl("(iPhone|Android)", statusSource),
+                       textLength))
>   t.test(sp, pc, var.equal = FALSE)

	       Welch Two Sample t-test
                                        !!
data: sp and pc
t = -15.7921, df = 1588.246, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -19.85334 -15.46645
sample estimates:
mean of x mean of y
 31.83945 49.49935
>   #                     t
>   pc <- unlist(subset(statusDF,
+                       statusSource %in% c("YoruFukurou", "web"),
+                       textLength))
>   sp <- unlist(subset(statusDF,
+                       grepl("(iPhone|Android)", statusSource),
+                       textLength))
>   t.test(sp, pc, var.equal = FALSE)

	       Welch Two Sample t-test
                                        !!
data: sp and pc
t = -15.7921, df = 1588.246, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -19.85334 -15.46645
sample estimates:
mean of x mean of y
 31.83945 49.49935
> extractScreenNames <- function(text, strict = TRUE) {
+     if (strict) {
+         # Twitter     screen_name
+         regex <- "(?:(?<!\\w)([@＠])((?>\\w+))(?![@＠])|[\\s\\S])"
+     } else {
+         #       hoge@example.com
+         regex <- "(?:([@＠])(\\w+)|[\\s\\S])"
+     }
+     screenNames <- gsub(regex, "\\1\\2", text, perl = TRUE)
+     unique(unlist(strsplit(substring(screenNames, 2), "[@＠]")))
+ }
> screenNames <- unlist(lapply(statusDF$text, extractScreenNames))
> head(sort(table(screenNames), decreasing = TRUE), 10) # Top 10
screenNames
        naopr     __gfx__ hirota_inoue     mandy_44    ask_a_lie
          105          85           51           47           40
    ken_nishi      nokuno      yokkuns   JinJin0613 kanon19_rie
           39          39           33           20           20
Twitter


•
•             reshape2
•               ggplot2
•
ggplot2
ggplot2
plot(statusDF$wday, col = "blue")
                                                                ggplot2




                                qplot(wday, data = statusDF, fill = I("blue"),
                                      alpha = I(0.7), xlab = "", ylab = "")
ggplot2




qplot(wday, data = statusDF, fill = statusSource,
      xlab = "", ylab = "")
ggplot2
                qplot(wday, data = statusDF, facets = ~ statusSource,
                      fill = I("blue"), alpha = I(0.7), xlab = "", ylab = "")




qplot(wday, data = statusDF, fill = statusSource,
      xlab = "", ylab = "")
ggplot2
                qplot(wday, data = statusDF, facets = ~ statusSource,
                      fill = I("blue"), alpha = I(0.7), xlab = "", ylab = "")




qplot(wday, data = statusDF, fill = statusSource,
      xlab = "", ylab = "")
qplot
      ggplot2
> args(qplot)
function (x, y = NULL, z = NULL, ..., data, facets = . ~ ., margins =
FALSE,
     geom = "auto", stat = list(NULL), position = list(NULL),
     xlim = c(NA, NA), ylim = c(NA, NA), log = "", main = NULL,
     xlab = deparse(substitute(x)), ylab = deparse(substitute(y)),
     asp = NA)
NULL
qplot   geom
       geom

area:
bar:
histogram:
line:
point:
qplot                                geom
       geom

area:
bar:
histogram:
line:
point:




  qplot(as.integer(wday), data = statusDF, geom = "area", stat = "bin",
        fill = statusSource, xlab = "", ylab = "", binwidth = 1)
qplot                                geom
       geom

area:
bar:
histogram:
line:
point:




  qplot(wday, data = statusDF, geom = "bar", stat = "bin",
        fill = statusSource, xlab = "", ylab = "")
qplot                                geom
       geom

area:
bar:
histogram:
line:
point:




  qplot(as.integer(wday), data = statusDF, geom = "line", stat = "bin",
        colour = statusSource, xlab = "", ylab = "", binwidth = 1)
qplot                                geom
       geom

area:
bar:
histogram:
line:
point:




  qplot(wday, data = statusDF, geom = "point", stat = "bin",
        colour = statusSource, xlab = "", ylab = "")
qplot            position
   position       geom


dodge   :
fill    :     1
jitter :
stack   :
qplot                              position
   position       geom


dodge   :
fill    :     1
jitter :
stack   :




  qplot(wday, data = statusDF, fill = statusSource,
        position = "dodge", xlab = "", ylab = "")
qplot                              position
   position       geom


dodge   :
fill    :     1
jitter :
stack   :




  qplot(wday, data = statusDF, fill = statusSource,
        position = "fill", xlab = "", ylab = "")
qplot                              position
   position       geom


dodge   :
fill    :     1
jitter :
stack   :




  qplot(wday, data = statusDF, fill = statusSource,
        position = "jitter", xlab = "", ylab = "")
qplot                              position
   position       geom


dodge   :
fill    :     1
jitter :
stack   :




  qplot(wday, data = statusDF, fill = statusSource,
        position = "stack", xlab = "", ylab = "")
qplot                           facets
    facets      geom
~           :
        1 ~       2:         1,         2
※reshape2              1 ~        2 +   3
qplot                                  facets
     facets       geom
~           :
         1 ~         2:            1,            2
※reshape2                 1 ~           2 +      3




    qplot(wday, data = statusDF, xlab = "", ylab = "",
          facets = ~ statusSource)
qplot                                  facets
     facets       geom
~           :
         1 ~         2:            1,            2
※reshape2                 1 ~           2 +      3




    qplot(wday, data = statusDF, xlab = "", ylab = "",
          facets = month ~ statusSource)
qplot
alpha               :
colour (color) :
fill                :
linetype            :
size                :



colour, fill, linetype           statusSource
                        fill = I("blue")        I   (AsIs)
qplot
alpha          :
colour (color) :
fill           :
linetype       :
size           :




  qplot(wday, data = statusDF, xlab = "", ylab = "",
        alpha = as.integer(wday))
qplot
alpha          :
colour (color) :
fill           :
linetype       :
size           :




  qplot(wday, data = statusDF, xlab = "", ylab = "",
        colour = statusSource)
qplot
alpha          :
colour (color) :
fill           :
linetype       :
size           :




  qplot(wday, data = statusDF, xlab = "", ylab = "",
        fill = statusSource)
qplot
alpha          :
colour (color) :
fill           :
linetype       :
size           :




  qplot(wday, data = statusDF, xlab = "", ylab = "",
        linetype = statusSource, colour = statusSource)
whotwi




         http://whotwi.com/
whotwi




         http://whotwi.com/
whotwi
>   #         Twitter
>   #       melt     cast               xtabs
> cnt <- as.data.frame(xtabs(~ hour + wday + statusSource, statusDF))
> head(cnt, 3)
  hour wday statusSource Freq
1    0 Mon YoruFukurou     48
2    1 Mon YoruFukurou     38
3    2 Mon YoruFukurou     25
whotwi
>   #           Twitter
>   #         melt     cast               xtabs
>   cnt <- as.data.frame(xtabs(~ hour + wday + statusSource, statusDF))
>   head(cnt, 3)
    hour wday statusSource Freq
1      0 Mon YoruFukurou     48
2      1 Mon YoruFukurou     38
3      2 Mon YoruFukurou     25
>   freqSources <- by(cnt, cnt[c("hour", "wday")], function(df) {
+      #
+      freqSource <- with(df, statusSource[order(Freq, decreasing = TRUE)
[1]])
+      cbind(df[1, c("hour", "wday")], freqSource)
+ })
> freqSources <- do.call(rbind, freqSources)
> head(freqSources, 3)
  hour wday freqSource
1     0 Mon YoruFukurou
2     1 Mon YoruFukurou
3     2 Mon YoruFukurou
whotwi
> #
> cntSum <- as.data.frame(xtabs(Freq ~ hour + wday, cnt))
> head(cntSum, 3)
  hour wday Freq
1    0 Mon    65
2    1 Mon    48
3    2 Mon    31
whotwi
> #
> cntSum <- as.data.frame(xtabs(Freq ~ hour + wday, cnt))
> head(cntSum, 3)
  hour wday Freq
1    0 Mon    65
2    1 Mon    48
3    2 Mon    31
> #
> data <- merge(cntSum, freqSources)
> #
> data$wday <- factor(data$wday, levels = rev(levels(data$wday)))
> #
> data$Freq <- log2(data$Freq)
whotwi
> #
> cntSum <- as.data.frame(xtabs(Freq ~ hour + wday, cnt))
> head(cntSum, 3)
  hour wday Freq
1    0 Mon    65
2    1 Mon    48
3    2 Mon    31
> #
> data <- merge(cntSum, freqSources)
> #
> data$wday <- factor(data$wday, levels = rev(levels(data$wday)))
> #
> data$Freq <- log2(data$Freq)
> p <- qplot(hour, wday, data = data, xlab = "", ylab = "",
+            geom = "point", colour = freqSource, size = Freq)
> p #             print(p)
whotwi
> #
> cntSum <- as.data.frame(xtabs(Freq ~ hour + wday, cnt))
> head(cntSum, 3)
  hour wday Freq
1    0 Mon    65
2    1 Mon    48
3    2 Mon    31
> #
> data <- merge(cntSum, freqSources)
> #
> data$wday <- factor(data$wday, levels = rev(levels(data$wday)))
> #
> data$Freq <- log2(data$Freq)
> p <- qplot(hour, wday, data = data, xlab = "", ylab = "",
+            geom = "point", colour = freqSource, size = Freq)
> p #             print(p)
whotwi
whotwi
whotwi
> # whotwi theme
> theme_whotwi <- function() {
+     opts( #
+          panel.background = theme_rect(fill = NA, colour = NA),
+           #
+          legend.key = theme_rect(fill = NA, colour = NA),
+           #
+          axis.ticks = theme_segment(colour = NA))
+ }
> p2 <- p + theme_whotwi() + scale_size(legend = FALSE) +
scale_colour_hue(name = "")
> p2
whotwi
> # whotwi theme
> theme_whotwi <- function() {
+     opts( #
+          panel.background = theme_rect(fill = NA, colour = NA),
+           #
+          legend.key = theme_rect(fill = NA, colour = NA),
+           #
+          axis.ticks = theme_segment(colour = NA))
+ }
> p2 <- p + theme_whotwi() + scale_size(legend = FALSE) +
scale_colour_hue(name = "")
> p2
whotwi
whotwi
whotwi
whotwi




         PC
whotwi



PC



         PC
Twitter


•
•             reshape2
•               ggplot2
•
TweetSentiments
TweetSentiments

R
1. RMeCab

2.

3.
RMeCab
    MeCab                      R

> library(RMeCab)
> (docDF(data.frame("                    "), column = 1, type = 1))
number of extracted terms = 5
now making a data frame. wait a while!

     TERM POS1   POS2 Row1
1                      1
2                       1
3                       1
4                       2
5                       2
http://www.lr.pi.titech.ac.jp/~takamura/pndic_ja.html
          :               :        :1
      :           :           :0.999995
      :               :           :0.999979
          :           :           :0.999979
              :               :         :0.999645
      :               :            :0.999486
      :           :           :0.999314
...
> #
> pndic <- read.table("http://www.lr.pi.titech.ac.jp/~takamura/pubs/
pn_ja.dic",
+                     sep = ":",
+                     col.names = c("term", "kana", "pos", "value"),
+                     colClasses = c("character", "character", "factor",
"numeric"),
+                     fileEncoding = "Shift_JIS")
> #
> #
> pndic2 <- aggregate(value ~ term + pos, pndic, mean)
> # pndic
> pos <- unique(pndic2$pos)
> tweetDF <- docDF(statusDF, column = "cleanText", type = 1, pos = pos)
number of extracted terms = 7164
now making a data frame. wait a while!

> tweetDF[2900:2904, 1:5]
         TERM   POS1 POS2 Row1 Row2
2900                      0    0
2901                         0       0
2902                     0       0
2903                         0       0
2904                         0       0
> # pndic
> tweetDF <- subset(tweetDF, TERM %in% pndic2$term)
> #
> tweetDF <- merge(tweetDF, pndic2, by.x = c("TERM", "POS1"), by.y = c
("term", "pos"))
> #
> score <- colSums(tweetDF[4:(ncol(tweetDF) - 1)] * tweetDF$value)
> #
> sum(score > 0)
[1] 117
> #
> score <- colSums(tweetDF[4:(ncol(tweetDF) - 1)] * tweetDF$value)
> #
> sum(score > 0)
[1] 117
> #
> score <- colSums(tweetDF[4:(ncol(tweetDF) - 1)] * tweetDF$value)
> #
> sum(score > 0)
[1] 117
> #
> sum(score < 0)
[1] 2765
> #
> score <- colSums(tweetDF[4:(ncol(tweetDF) - 1)] * tweetDF$value)
> #
> sum(score > 0)
[1] 117
> #
> sum(score < 0)
[1] 2765
> #
> score <- colSums(tweetDF[4:(ncol(tweetDF) - 1)] * tweetDF$value)
> #
> sum(score > 0)
[1] 117
> #
> sum(score < 0)
[1] 2765
> #
> sum(score == 0)
[1] 277
> #
> score <- colSums(tweetDF[4:(ncol(tweetDF) - 1)] * tweetDF$value)
> #
> sum(score > 0)
[1] 117
> #
> sum(score < 0)
[1] 2765
> #
> sum(score == 0)
[1] 277
> #
> score <- colSums(tweetDF[4:(ncol(tweetDF) - 1)] * tweetDF$value)
> #
> sum(score > 0)
[1] 117
> #
> sum(score < 0)
[1] 2765
> #
> sum(score == 0)
[1] 277
> table(ifelse(pndic$value > 0, "positive",
+              ifelse(pndic$value == 0, "neutral", "negative")))

negative   neutral positive
   49983        20     5122
> table(ifelse(pndic$value > 0, "positive",
+              ifelse(pndic$value == 0, "neutral", "negative")))

negative   neutral positive
   49983        20     5122
> m <- mean(score)
> #
> tweetType <- factor(ifelse(score > m, "positive",
+                     ifelse(score == m, "neutral", "negative")),
+                     levels = c("positive", "neutral", "negative"))
> table(tweetType)
tweetType
positive neutral negative
    1912        0     1247
> statusDF$tweetType <- droplevels(tweetType)
> #
> qplot(month, data = statusDF,
+       geom = "bar", fill = tweetType, position = "fill")
> statusDF$tweetType <- droplevels(tweetType)
> #
> qplot(month, data = statusDF,
+       geom = "bar", fill = tweetType, position = "fill")
> statusDF$tweetType <- droplevels(tweetType)
> #
> qplot(month, data = statusDF,
+       geom = "bar", fill = tweetType, position = "fill")
> statusDF$tweetType <- droplevels(tweetType)
> #
> qplot(month, data = statusDF,
+       geom = "bar", fill = tweetType, position = "fill")
twitteR
•                                 RJSONIO
•
•             ID   status ID
• fav               favorited   TRUE
• truncated   TRUE
• DM
• status
  character   factor
twitteR
•                                 RJSONIO
•
•             ID   status ID
• fav               favorited   TRUE
• truncated   TRUE
• DM
• status
  character   factor
OAuth   ”   ”   twitteR   -
• twitteR
• reshape2       R


• ggplot2


• RMeCab     R
• twitteR
• reshape2       R


• ggplot2


• RMeCab     R

• PC
•
https://github.com/abicky/rjpusers2011_abicky
status
> statuses[[1]]$text
[1] "                                    "
> statuses[[1]]$getText() #
[1] "                                    "
> #
> statuses[[1]]$text <- "                    "
> statuses[[1]]$getText()
[1] "                                "
> statuses[[1]]$setText("ggrks") #
> statuses[[1]]$getText()
[1] "ggrks"
> #
> statuses[[1]]$getCreated()
[1] "2011-11-23 22:16:24 UTC"
removeSpecialStr

# Strip Twitter-specific markup from tweet text.
#
# Applies the three removal helpers in sequence: screen names first, then
# hashtags, then URLs. Each helper is called with its default arguments.
#
# Args:
#   text: value passed to removeScreenName(); presumably a character vector
#         of tweet texts.
# Returns:
#   Whatever removeURL() returns for the twice-filtered text.
removeSpecialStr <- function(text) {
    withoutNames <- removeScreenName(text)
    withoutTags <- removeHashTag(withoutNames)
    removeURL(withoutTags)
}
removeScreenName

# Remove Twitter screen-name mentions (e.g. "@a_bicky") from text.
#
# Args:
#   text:   character vector to clean.
#   strict: if TRUE (default), a mention is only removed when the at sign is
#           not preceded by a word character and the name is not directly
#           followed by another at sign, so "hoge@example.com" is left
#           alone. If FALSE, every at sign followed by word characters is
#           removed.
# Returns:
#   A character vector of the same length as `text` with mentions deleted.
removeScreenName <- function(text, strict = TRUE) {
    # Both the ASCII "@" and the full-width "\uFF20" introduce a mention.
    # The regex escapes (\\w) must be doubled inside an R string literal;
    # a bare "w" would only match the letter w.
    if (strict) {
        # (?>...) is atomic: the name is consumed greedily and not given
        # back, so "@foo@bar" cannot partially match as "@fo".
        regex <- "(?<!\\w)[@\uFF20](?>\\w+)(?![@\uFF20])"
    } else {
        regex <- "[@\uFF20]\\w+"
    }
    gsub(regex, "", text, perl = TRUE)
}
removeURL

# Remove http/https URLs from text.
#
# Args:
#   text:   character vector to clean.
#   strict: if TRUE (default), only remove strings shaped like
#           scheme://[user:password@]host.tld[/path] that are not glued to
#           a preceding word character or markup character (so a URL inside
#           an HTML attribute such as href="..." is kept). If FALSE, remove
#           anything starting with http:// or https:// followed by
#           URL-ish characters.
# Returns:
#   A character vector of the same length as `text` with URLs deleted.
removeURL <- function(text, strict = TRUE) {
    if (strict) {
        # Built from pieces so that the pattern is a single well-formed
        # string literal; every regex escape is doubled for R.
        regex <- paste0(
            # Not directly preceded by a word or markup character.
            "(?<![-.\\w#@=!'\"/])",
            "https?://",
            # Optional user:password@ part. Whitespace, "/" and "@" are
            # excluded so it cannot run past the URL into later text.
            "(?:[^\\s:@/]+:[^\\s@/]+@)?",
            # One or more dot-terminated host labels not ending in "-".
            "(?:[0-9A-Za-z][-0-9A-Za-z]*(?<!-)\\.)+",
            # Top-level domain: letters only ("A-Za-z", not "A-za-z",
            # which would also admit "[", "\\", "]", "^", "_" and "`").
            "[A-Za-z]+",
            # Zero or more path segments.
            "(?:/[-\\w#%=+,.?!&~]*)*"
        )
    } else {
        regex <- "https?://[-\\w#%=+,.?!&~/]+"
    }
    gsub(regex, "", text, perl = TRUE)
}
removeHashTag

# Remove Twitter hashtags (e.g. "#rstats") from text.
#
# Args:
#   text:   character vector to clean.
#   strict: if TRUE (default), a hashtag is only removed when it starts the
#           string or follows a delimiter, and a tag made up solely of
#           ASCII digits (e.g. "#123") is kept because it is not treated as
#           a hashtag. If FALSE, every number sign followed by
#           non-delimiter characters is removed.
# Returns:
#   A character vector of the same length as `text` with hashtags deleted.
removeHashTag <- function(text, strict = TRUE) {
    # ASCII "#" and the full-width number sign.
    hashSigns <- "#\uFF03"
    # Whitespace, ASCII comma/period, ideographic space, comma and full
    # stop, and the full-width "!" and "?".
    delimiters <- "\\s,.\u3000-\u3002\uFF01\uFF1F"
    # cf. http://nobu666.com/2011/07/13/914.html
    # Character-class ranges accepted inside a tag in addition to \w.
    # Joined with paste0() so no newline ends up inside the class.
    validJa <- paste0(
        "\u3041-\u3094\u3099-\u309C\u30A1-\u30FA\u30FC",
        "\u3400-\uD7A3\uFF10-\uFF19\uFF21-\uFF3A\uFF41-\uFF5A",
        "\uFF66-\uFF9E"
    )
    if (strict) {
        # Group 1: the leading delimiter (always kept).
        # Group 2: a digits-only tag, kept because it is not a hashtag. The
        #          lookahead uses the same tag alphabet as the second
        #          alternative, so "#123" followed by a tag character falls
        #          through and is removed as a whole.
        regex <- sprintf(
            "(^|[%s])(?:([%s](?>[0-9]+)(?![\\w%s]))|[%s][\\w%s]+)",
            delimiters, hashSigns, validJa, hashSigns, validJa
        )
        replacement <- "\\1\\2"
    } else {
        # No capture groups here, so the whole match is simply dropped.
        regex <- sprintf("[%s][^%s]+", hashSigns, delimiters)
        replacement <- ""
    }
    gsub(regex, replacement, text, perl = TRUE)
}

More Related Content

What's hot

Is Haskell an acceptable Perl?
Is Haskell an acceptable Perl?Is Haskell an acceptable Perl?
Is Haskell an acceptable Perl?osfameron
 
Byterun, a Python bytecode interpreter - Allison Kaptur at NYCPython
Byterun, a Python bytecode interpreter - Allison Kaptur at NYCPythonByterun, a Python bytecode interpreter - Allison Kaptur at NYCPython
Byterun, a Python bytecode interpreter - Allison Kaptur at NYCPythonakaptur
 
Clustering com numpy e cython
Clustering com numpy e cythonClustering com numpy e cython
Clustering com numpy e cythonAnderson Dantas
 
Data monsters probablistic data structures
Data monsters probablistic data structuresData monsters probablistic data structures
Data monsters probablistic data structuresGreenM
 
Allison Kaptur: Bytes in the Machine: Inside the CPython interpreter, PyGotha...
Allison Kaptur: Bytes in the Machine: Inside the CPython interpreter, PyGotha...Allison Kaptur: Bytes in the Machine: Inside the CPython interpreter, PyGotha...
Allison Kaptur: Bytes in the Machine: Inside the CPython interpreter, PyGotha...akaptur
 
dplyr and torrents from cpasbien
dplyr and torrents from cpasbiendplyr and torrents from cpasbien
dplyr and torrents from cpasbienRomain Francois
 
The groovy puzzlers (as Presented at JavaOne 2014)
The groovy puzzlers (as Presented at JavaOne 2014)The groovy puzzlers (as Presented at JavaOne 2014)
The groovy puzzlers (as Presented at JavaOne 2014)GroovyPuzzlers
 
M12 random forest-part01
M12 random forest-part01M12 random forest-part01
M12 random forest-part01Raman Kannan
 
Bytes in the Machine: Inside the CPython interpreter
Bytes in the Machine: Inside the CPython interpreterBytes in the Machine: Inside the CPython interpreter
Bytes in the Machine: Inside the CPython interpreterakaptur
 
Spark DataFrames for Data Munging
Spark DataFrames for Data MungingSpark DataFrames for Data Munging
Spark DataFrames for Data Munging(Susan) Xinh Huynh
 
M09-Cross validating-naive-bayes
M09-Cross validating-naive-bayesM09-Cross validating-naive-bayes
M09-Cross validating-naive-bayesRaman Kannan
 
第二讲 Python基礎
第二讲 Python基礎第二讲 Python基礎
第二讲 Python基礎juzihua1102
 
第二讲 预备-Python基礎
第二讲 预备-Python基礎第二讲 预备-Python基礎
第二讲 预备-Python基礎anzhong70
 
"A 1,500 line (!!) switch statement powers your Python!" - Allison Kaptur, !!...
"A 1,500 line (!!) switch statement powers your Python!" - Allison Kaptur, !!..."A 1,500 line (!!) switch statement powers your Python!" - Allison Kaptur, !!...
"A 1,500 line (!!) switch statement powers your Python!" - Allison Kaptur, !!...akaptur
 
(gentle (introduction Clojure))
(gentle (introduction Clojure))(gentle (introduction Clojure))
(gentle (introduction Clojure))Guy Taylor
 
An Elephant of a Different Colour: Hack
An Elephant of a Different Colour: HackAn Elephant of a Different Colour: Hack
An Elephant of a Different Colour: HackVic Metcalfe
 
PLOTCON NYC: Behind Every Great Plot There's a Great Deal of Wrangling
PLOTCON NYC: Behind Every Great Plot There's a Great Deal of WranglingPLOTCON NYC: Behind Every Great Plot There's a Great Deal of Wrangling
PLOTCON NYC: Behind Every Great Plot There's a Great Deal of WranglingPlotly
 
Top 10 php classic traps
Top 10 php classic trapsTop 10 php classic traps
Top 10 php classic trapsDamien Seguy
 
MongoUK - PHP Development
MongoUK - PHP DevelopmentMongoUK - PHP Development
MongoUK - PHP DevelopmentBoxed Ice
 

What's hot (20)

Is Haskell an acceptable Perl?
Is Haskell an acceptable Perl?Is Haskell an acceptable Perl?
Is Haskell an acceptable Perl?
 
Byterun, a Python bytecode interpreter - Allison Kaptur at NYCPython
Byterun, a Python bytecode interpreter - Allison Kaptur at NYCPythonByterun, a Python bytecode interpreter - Allison Kaptur at NYCPython
Byterun, a Python bytecode interpreter - Allison Kaptur at NYCPython
 
Clustering com numpy e cython
Clustering com numpy e cythonClustering com numpy e cython
Clustering com numpy e cython
 
Data monsters probablistic data structures
Data monsters probablistic data structuresData monsters probablistic data structures
Data monsters probablistic data structures
 
Allison Kaptur: Bytes in the Machine: Inside the CPython interpreter, PyGotha...
Allison Kaptur: Bytes in the Machine: Inside the CPython interpreter, PyGotha...Allison Kaptur: Bytes in the Machine: Inside the CPython interpreter, PyGotha...
Allison Kaptur: Bytes in the Machine: Inside the CPython interpreter, PyGotha...
 
dplyr and torrents from cpasbien
dplyr and torrents from cpasbiendplyr and torrents from cpasbien
dplyr and torrents from cpasbien
 
The groovy puzzlers (as Presented at JavaOne 2014)
The groovy puzzlers (as Presented at JavaOne 2014)The groovy puzzlers (as Presented at JavaOne 2014)
The groovy puzzlers (as Presented at JavaOne 2014)
 
Malcon2017
Malcon2017Malcon2017
Malcon2017
 
M12 random forest-part01
M12 random forest-part01M12 random forest-part01
M12 random forest-part01
 
Bytes in the Machine: Inside the CPython interpreter
Bytes in the Machine: Inside the CPython interpreterBytes in the Machine: Inside the CPython interpreter
Bytes in the Machine: Inside the CPython interpreter
 
Spark DataFrames for Data Munging
Spark DataFrames for Data MungingSpark DataFrames for Data Munging
Spark DataFrames for Data Munging
 
M09-Cross validating-naive-bayes
M09-Cross validating-naive-bayesM09-Cross validating-naive-bayes
M09-Cross validating-naive-bayes
 
第二讲 Python基礎
第二讲 Python基礎第二讲 Python基礎
第二讲 Python基礎
 
第二讲 预备-Python基礎
第二讲 预备-Python基礎第二讲 预备-Python基礎
第二讲 预备-Python基礎
 
"A 1,500 line (!!) switch statement powers your Python!" - Allison Kaptur, !!...
"A 1,500 line (!!) switch statement powers your Python!" - Allison Kaptur, !!..."A 1,500 line (!!) switch statement powers your Python!" - Allison Kaptur, !!...
"A 1,500 line (!!) switch statement powers your Python!" - Allison Kaptur, !!...
 
(gentle (introduction Clojure))
(gentle (introduction Clojure))(gentle (introduction Clojure))
(gentle (introduction Clojure))
 
An Elephant of a Different Colour: Hack
An Elephant of a Different Colour: HackAn Elephant of a Different Colour: Hack
An Elephant of a Different Colour: Hack
 
PLOTCON NYC: Behind Every Great Plot There's a Great Deal of Wrangling
PLOTCON NYC: Behind Every Great Plot There's a Great Deal of WranglingPLOTCON NYC: Behind Every Great Plot There's a Great Deal of Wrangling
PLOTCON NYC: Behind Every Great Plot There's a Great Deal of Wrangling
 
Top 10 php classic traps
Top 10 php classic trapsTop 10 php classic traps
Top 10 php classic traps
 
MongoUK - PHP Development
MongoUK - PHP DevelopmentMongoUK - PHP Development
MongoUK - PHP Development
 

Viewers also liked

Rによるテキストマイニングの一例
Rによるテキストマイニングの一例Rによるテキストマイニングの一例
Rによるテキストマイニングの一例LINE Corp.
 
TwitterのデータをRであれこれ
TwitterのデータをRであれこれTwitterのデータをRであれこれ
TwitterのデータをRであれこれTakeshi Arabiki
 
あんちべのすべらない話~俺のツイートがこんなにウケないはずがない~
あんちべのすべらない話~俺のツイートがこんなにウケないはずがない~あんちべのすべらない話~俺のツイートがこんなにウケないはずがない~
あんちべのすべらない話~俺のツイートがこんなにウケないはずがない~antibayesian 俺がS式だ
 
RでTwitterテキストマイニング
RでTwitterテキストマイニングRでTwitterテキストマイニング
RでTwitterテキストマイニングYudai Shinbo
 
「数字を見せろ」から「コードを見せろ」へ 〜過程の透明性を確保したデータ可視化を目指す〜
「数字を見せろ」から「コードを見せろ」へ 〜過程の透明性を確保したデータ可視化を目指す〜「数字を見せろ」から「コードを見せろ」へ 〜過程の透明性を確保したデータ可視化を目指す〜
「数字を見せろ」から「コードを見せろ」へ 〜過程の透明性を確保したデータ可視化を目指す〜Keiichiro Ono
 
さくっとはじめるテキストマイニング(R言語)  スタートアップ編
さくっとはじめるテキストマイニング(R言語)  スタートアップ編さくっとはじめるテキストマイニング(R言語)  スタートアップ編
さくっとはじめるテキストマイニング(R言語)  スタートアップ編Yutaka Shimada
 
動画タグネットワーク分析を用いた ニコニコ動画における萌芽文化発見の試み ~”ゆっくり関連タグ”を例として~
動画タグネットワーク分析を用いた ニコニコ動画における萌芽文化発見の試み ~”ゆっくり関連タグ”を例として~動画タグネットワーク分析を用いた ニコニコ動画における萌芽文化発見の試み ~”ゆっくり関連タグ”を例として~
動画タグネットワーク分析を用いた ニコニコ動画における萌芽文化発見の試み ~”ゆっくり関連タグ”を例として~Yusuke Fukasawa
 
経済的寄与度を重視した橋梁管理手法の有効性に関するマルチエージェントシミュレーションによる分析
経済的寄与度を重視した橋梁管理手法の有効性に関するマルチエージェントシミュレーションによる分析経済的寄与度を重視した橋梁管理手法の有効性に関するマルチエージェントシミュレーションによる分析
経済的寄与度を重視した橋梁管理手法の有効性に関するマルチエージェントシミュレーションによる分析Yusuke Fukasawa
 
【ニコニコ動画】"マリオメーカー問題"のもたらした影響をネットワーク分析してみた
【ニコニコ動画】"マリオメーカー問題"のもたらした影響をネットワーク分析してみた【ニコニコ動画】"マリオメーカー問題"のもたらした影響をネットワーク分析してみた
【ニコニコ動画】"マリオメーカー問題"のもたらした影響をネットワーク分析してみたYusuke Fukasawa
 
料理レシピサービスにおける検索語の意味変化に関する分析
料理レシピサービスにおける検索語の意味変化に関する分析料理レシピサービスにおける検索語の意味変化に関する分析
料理レシピサービスにおける検索語の意味変化に関する分析Yusuke Fukasawa
 
DeNAの報告書を可視化して雰囲気をつかむ
DeNAの報告書を可視化して雰囲気をつかむDeNAの報告書を可視化して雰囲気をつかむ
DeNAの報告書を可視化して雰囲気をつかむYusuke Fukasawa
 
ニコニコ動画における関連動画情報を用いたカテゴリ特徴の把握
ニコニコ動画における関連動画情報を用いたカテゴリ特徴の把握ニコニコ動画における関連動画情報を用いたカテゴリ特徴の把握
ニコニコ動画における関連動画情報を用いたカテゴリ特徴の把握Yusuke Fukasawa
 
巨大な表を高速に扱うData.table について
巨大な表を高速に扱うData.table について巨大な表を高速に扱うData.table について
巨大な表を高速に扱うData.table についてHaruka Ozaki
 
data.tableパッケージで大規模データをサクッと処理する
data.tableパッケージで大規模データをサクッと処理するdata.tableパッケージで大規模データをサクッと処理する
data.tableパッケージで大規模データをサクッと処理するShintaro Fukushima
 
LDA等のトピックモデル
LDA等のトピックモデルLDA等のトピックモデル
LDA等のトピックモデルMathieu Bertin
 
トピックモデルの話
トピックモデルの話トピックモデルの話
トピックモデルの話kogecoo
 

Viewers also liked (16)

Rによるテキストマイニングの一例
Rによるテキストマイニングの一例Rによるテキストマイニングの一例
Rによるテキストマイニングの一例
 
TwitterのデータをRであれこれ
TwitterのデータをRであれこれTwitterのデータをRであれこれ
TwitterのデータをRであれこれ
 
あんちべのすべらない話~俺のツイートがこんなにウケないはずがない~
あんちべのすべらない話~俺のツイートがこんなにウケないはずがない~あんちべのすべらない話~俺のツイートがこんなにウケないはずがない~
あんちべのすべらない話~俺のツイートがこんなにウケないはずがない~
 
RでTwitterテキストマイニング
RでTwitterテキストマイニングRでTwitterテキストマイニング
RでTwitterテキストマイニング
 
「数字を見せろ」から「コードを見せろ」へ 〜過程の透明性を確保したデータ可視化を目指す〜
「数字を見せろ」から「コードを見せろ」へ 〜過程の透明性を確保したデータ可視化を目指す〜「数字を見せろ」から「コードを見せろ」へ 〜過程の透明性を確保したデータ可視化を目指す〜
「数字を見せろ」から「コードを見せろ」へ 〜過程の透明性を確保したデータ可視化を目指す〜
 
さくっとはじめるテキストマイニング(R言語)  スタートアップ編
さくっとはじめるテキストマイニング(R言語)  スタートアップ編さくっとはじめるテキストマイニング(R言語)  スタートアップ編
さくっとはじめるテキストマイニング(R言語)  スタートアップ編
 
動画タグネットワーク分析を用いた ニコニコ動画における萌芽文化発見の試み ~”ゆっくり関連タグ”を例として~
動画タグネットワーク分析を用いた ニコニコ動画における萌芽文化発見の試み ~”ゆっくり関連タグ”を例として~動画タグネットワーク分析を用いた ニコニコ動画における萌芽文化発見の試み ~”ゆっくり関連タグ”を例として~
動画タグネットワーク分析を用いた ニコニコ動画における萌芽文化発見の試み ~”ゆっくり関連タグ”を例として~
 
経済的寄与度を重視した橋梁管理手法の有効性に関するマルチエージェントシミュレーションによる分析
経済的寄与度を重視した橋梁管理手法の有効性に関するマルチエージェントシミュレーションによる分析経済的寄与度を重視した橋梁管理手法の有効性に関するマルチエージェントシミュレーションによる分析
経済的寄与度を重視した橋梁管理手法の有効性に関するマルチエージェントシミュレーションによる分析
 
【ニコニコ動画】"マリオメーカー問題"のもたらした影響をネットワーク分析してみた
【ニコニコ動画】"マリオメーカー問題"のもたらした影響をネットワーク分析してみた【ニコニコ動画】"マリオメーカー問題"のもたらした影響をネットワーク分析してみた
【ニコニコ動画】"マリオメーカー問題"のもたらした影響をネットワーク分析してみた
 
料理レシピサービスにおける検索語の意味変化に関する分析
料理レシピサービスにおける検索語の意味変化に関する分析料理レシピサービスにおける検索語の意味変化に関する分析
料理レシピサービスにおける検索語の意味変化に関する分析
 
DeNAの報告書を可視化して雰囲気をつかむ
DeNAの報告書を可視化して雰囲気をつかむDeNAの報告書を可視化して雰囲気をつかむ
DeNAの報告書を可視化して雰囲気をつかむ
 
ニコニコ動画における関連動画情報を用いたカテゴリ特徴の把握
ニコニコ動画における関連動画情報を用いたカテゴリ特徴の把握ニコニコ動画における関連動画情報を用いたカテゴリ特徴の把握
ニコニコ動画における関連動画情報を用いたカテゴリ特徴の把握
 
巨大な表を高速に扱うData.table について
巨大な表を高速に扱うData.table について巨大な表を高速に扱うData.table について
巨大な表を高速に扱うData.table について
 
data.tableパッケージで大規模データをサクッと処理する
data.tableパッケージで大規模データをサクッと処理するdata.tableパッケージで大規模データをサクッと処理する
data.tableパッケージで大規模データをサクッと処理する
 
LDA等のトピックモデル
LDA等のトピックモデルLDA等のトピックモデル
LDA等のトピックモデル
 
トピックモデルの話
トピックモデルの話トピックモデルの話
トピックモデルの話
 

Similar to RではじめるTwitter解析

Just in time (series) - KairosDB
Just in time (series) - KairosDBJust in time (series) - KairosDB
Just in time (series) - KairosDBVictor Anjos
 
第13回数学カフェ「素数!!」二次会 LT資料「乱数!!」
第13回数学カフェ「素数!!」二次会 LT資料「乱数!!」第13回数学カフェ「素数!!」二次会 LT資料「乱数!!」
第13回数学カフェ「素数!!」二次会 LT資料「乱数!!」Ken'ichi Matsui
 
Win-Win Search: Dual-Agent Stochastic Game in Session Search (SIGIR 2014)
Win-Win Search: Dual-Agent Stochastic Game in Session Search (SIGIR 2014)Win-Win Search: Dual-Agent Stochastic Game in Session Search (SIGIR 2014)
Win-Win Search: Dual-Agent Stochastic Game in Session Search (SIGIR 2014)Grace Yang
 
[1062BPY12001] Data analysis with R / week 2
[1062BPY12001] Data analysis with R / week 2[1062BPY12001] Data analysis with R / week 2
[1062BPY12001] Data analysis with R / week 2Kevin Chun-Hsien Hsu
 
Beyond PHP - it's not (just) about the code
Beyond PHP - it's not (just) about the codeBeyond PHP - it's not (just) about the code
Beyond PHP - it's not (just) about the codeWim Godden
 
Datamining r 1st
Datamining r 1stDatamining r 1st
Datamining r 1stsesejun
 
Functional programming in Swift
Functional programming in SwiftFunctional programming in Swift
Functional programming in SwiftJohn Pham
 
PRE: Datamining 2nd R
PRE: Datamining 2nd RPRE: Datamining 2nd R
PRE: Datamining 2nd Rsesejun
 
Datamining R 1st
Datamining R 1stDatamining R 1st
Datamining R 1stsesejun
 
Refactoring to Macros with Clojure
Refactoring to Macros with ClojureRefactoring to Macros with Clojure
Refactoring to Macros with ClojureDmitry Buzdin
 
Table of Useful R commands.
Table of Useful R commands.Table of Useful R commands.
Table of Useful R commands.Dr. Volkan OBAN
 
Gotcha! Ruby things that will come back to bite you.
Gotcha! Ruby things that will come back to bite you.Gotcha! Ruby things that will come back to bite you.
Gotcha! Ruby things that will come back to bite you.David Tollmyr
 
Text Mining of Twitter in Data Mining
Text Mining of Twitter in Data MiningText Mining of Twitter in Data Mining
Text Mining of Twitter in Data MiningMeghaj Mallick
 
R Programming: Export/Output Data In R
R Programming: Export/Output Data In RR Programming: Export/Output Data In R
R Programming: Export/Output Data In RRsquared Academy
 
第6回 関数とフロー制御
第6回 関数とフロー制御第6回 関数とフロー制御
第6回 関数とフロー制御Wataru Shito
 

Similar to RではじめるTwitter解析 (20)

Just in time (series) - KairosDB
Just in time (series) - KairosDBJust in time (series) - KairosDB
Just in time (series) - KairosDB
 
第13回数学カフェ「素数!!」二次会 LT資料「乱数!!」
第13回数学カフェ「素数!!」二次会 LT資料「乱数!!」第13回数学カフェ「素数!!」二次会 LT資料「乱数!!」
第13回数学カフェ「素数!!」二次会 LT資料「乱数!!」
 
Win-Win Search: Dual-Agent Stochastic Game in Session Search (SIGIR 2014)
Win-Win Search: Dual-Agent Stochastic Game in Session Search (SIGIR 2014)Win-Win Search: Dual-Agent Stochastic Game in Session Search (SIGIR 2014)
Win-Win Search: Dual-Agent Stochastic Game in Session Search (SIGIR 2014)
 
[1062BPY12001] Data analysis with R / week 2
[1062BPY12001] Data analysis with R / week 2[1062BPY12001] Data analysis with R / week 2
[1062BPY12001] Data analysis with R / week 2
 
Beyond PHP - it's not (just) about the code
Beyond PHP - it's not (just) about the codeBeyond PHP - it's not (just) about the code
Beyond PHP - it's not (just) about the code
 
Datamining r 1st
Datamining r 1stDatamining r 1st
Datamining r 1st
 
dplyr
dplyrdplyr
dplyr
 
Functional programming in Swift
Functional programming in SwiftFunctional programming in Swift
Functional programming in Swift
 
PRE: Datamining 2nd R
PRE: Datamining 2nd RPRE: Datamining 2nd R
PRE: Datamining 2nd R
 
Datamining R 1st
Datamining R 1stDatamining R 1st
Datamining R 1st
 
Refactoring to Macros with Clojure
Refactoring to Macros with ClojureRefactoring to Macros with Clojure
Refactoring to Macros with Clojure
 
Table of Useful R commands.
Table of Useful R commands.Table of Useful R commands.
Table of Useful R commands.
 
Into Clojure
Into ClojureInto Clojure
Into Clojure
 
Spark_Documentation_Template1
Spark_Documentation_Template1Spark_Documentation_Template1
Spark_Documentation_Template1
 
R programming language
R programming languageR programming language
R programming language
 
Slides ads ia
Slides ads iaSlides ads ia
Slides ads ia
 
Gotcha! Ruby things that will come back to bite you.
Gotcha! Ruby things that will come back to bite you.Gotcha! Ruby things that will come back to bite you.
Gotcha! Ruby things that will come back to bite you.
 
Text Mining of Twitter in Data Mining
Text Mining of Twitter in Data MiningText Mining of Twitter in Data Mining
Text Mining of Twitter in Data Mining
 
R Programming: Export/Output Data In R
R Programming: Export/Output Data In RR Programming: Export/Output Data In R
R Programming: Export/Output Data In R
 
第6回 関数とフロー制御
第6回 関数とフロー制御第6回 関数とフロー制御
第6回 関数とフロー制御
 

More from Takeshi Arabiki

クックパッド特売情報 における自然言語処理 〜固有表現抽出を利用した検索システム〜
クックパッド特売情報 における自然言語処理 〜固有表現抽出を利用した検索システム〜クックパッド特売情報 における自然言語処理 〜固有表現抽出を利用した検索システム〜
クックパッド特売情報 における自然言語処理 〜固有表現抽出を利用した検索システム〜Takeshi Arabiki
 
Introduction to Japanese Morphological Analysis
Introduction to Japanese Morphological AnalysisIntroduction to Japanese Morphological Analysis
Introduction to Japanese Morphological AnalysisTakeshi Arabiki
 
R による文書分類入門
R による文書分類入門R による文書分類入門
R による文書分類入門Takeshi Arabiki
 
Rのデータ構造とメモリ管理
Rのデータ構造とメモリ管理Rのデータ構造とメモリ管理
Rのデータ構造とメモリ管理Takeshi Arabiki
 
HTML5 Canvas で学ぶアフィン変換
HTML5 Canvas で学ぶアフィン変換HTML5 Canvas で学ぶアフィン変換
HTML5 Canvas で学ぶアフィン変換Takeshi Arabiki
 
Introduction to Favmemo for Immature Engineers
Introduction to Favmemo for Immature EngineersIntroduction to Favmemo for Immature Engineers
Introduction to Favmemo for Immature EngineersTakeshi Arabiki
 
Rのスコープとフレームと環境と
Rのスコープとフレームと環境とRのスコープとフレームと環境と
Rのスコープとフレームと環境とTakeshi Arabiki
 
R版Getopt::Longを作ってみた
R版Getopt::Longを作ってみたR版Getopt::Longを作ってみた
R版Getopt::Longを作ってみたTakeshi Arabiki
 
Rデータフレーム自由自在
Rデータフレーム自由自在Rデータフレーム自由自在
Rデータフレーム自由自在Takeshi Arabiki
 
文字列カーネルによる辞書なしツイート分類 〜文字列カーネル入門〜
文字列カーネルによる辞書なしツイート分類 〜文字列カーネル入門〜文字列カーネルによる辞書なしツイート分類 〜文字列カーネル入門〜
文字列カーネルによる辞書なしツイート分類 〜文字列カーネル入門〜Takeshi Arabiki
 
Rデバッグあれこれ
RデバッグあれこれRデバッグあれこれ
RデバッグあれこれTakeshi Arabiki
 
はじめてのまっぷりでゅ〜す
はじめてのまっぷりでゅ〜すはじめてのまっぷりでゅ〜す
はじめてのまっぷりでゅ〜すTakeshi Arabiki
 
Twitterのデータを取得する準備
Twitterのデータを取得する準備Twitterのデータを取得する準備
Twitterのデータを取得する準備Takeshi Arabiki
 

More from Takeshi Arabiki (15)

開発の心得
開発の心得開発の心得
開発の心得
 
クックパッド特売情報 における自然言語処理 〜固有表現抽出を利用した検索システム〜
クックパッド特売情報 における自然言語処理 〜固有表現抽出を利用した検索システム〜クックパッド特売情報 における自然言語処理 〜固有表現抽出を利用した検索システム〜
クックパッド特売情報 における自然言語処理 〜固有表現抽出を利用した検索システム〜
 
Introduction to Japanese Morphological Analysis
Introduction to Japanese Morphological AnalysisIntroduction to Japanese Morphological Analysis
Introduction to Japanese Morphological Analysis
 
R による文書分類入門
R による文書分類入門R による文書分類入門
R による文書分類入門
 
Rのデータ構造とメモリ管理
Rのデータ構造とメモリ管理Rのデータ構造とメモリ管理
Rのデータ構造とメモリ管理
 
HTML5 Canvas で学ぶアフィン変換
HTML5 Canvas で学ぶアフィン変換HTML5 Canvas で学ぶアフィン変換
HTML5 Canvas で学ぶアフィン変換
 
Introduction to Favmemo for Immature Engineers
Introduction to Favmemo for Immature EngineersIntroduction to Favmemo for Immature Engineers
Introduction to Favmemo for Immature Engineers
 
Rのスコープとフレームと環境と
Rのスコープとフレームと環境とRのスコープとフレームと環境と
Rのスコープとフレームと環境と
 
R版Getopt::Longを作ってみた
R版Getopt::Longを作ってみたR版Getopt::Longを作ってみた
R版Getopt::Longを作ってみた
 
Rデータフレーム自由自在
Rデータフレーム自由自在Rデータフレーム自由自在
Rデータフレーム自由自在
 
HMM, MEMM, CRF メモ
HMM, MEMM, CRF メモHMM, MEMM, CRF メモ
HMM, MEMM, CRF メモ
 
文字列カーネルによる辞書なしツイート分類 〜文字列カーネル入門〜
文字列カーネルによる辞書なしツイート分類 〜文字列カーネル入門〜文字列カーネルによる辞書なしツイート分類 〜文字列カーネル入門〜
文字列カーネルによる辞書なしツイート分類 〜文字列カーネル入門〜
 
Rデバッグあれこれ
RデバッグあれこれRデバッグあれこれ
Rデバッグあれこれ
 
はじめてのまっぷりでゅ〜す
はじめてのまっぷりでゅ〜すはじめてのまっぷりでゅ〜す
はじめてのまっぷりでゅ〜す
 
Twitterのデータを取得する準備
Twitterのデータを取得する準備Twitterのデータを取得する準備
Twitterのデータを取得する準備
 

Recently uploaded

Generative AI - Gitex v1Generative AI - Gitex v1.pptx
Generative AI - Gitex v1Generative AI - Gitex v1.pptxGenerative AI - Gitex v1Generative AI - Gitex v1.pptx
Generative AI - Gitex v1Generative AI - Gitex v1.pptxfnnc6jmgwh
 
Top 10 Hubspot Development Companies in 2024
Top 10 Hubspot Development Companies in 2024Top 10 Hubspot Development Companies in 2024
Top 10 Hubspot Development Companies in 2024TopCSSGallery
 
Landscape Catalogue 2024 Australia-1.pdf
Landscape Catalogue 2024 Australia-1.pdfLandscape Catalogue 2024 Australia-1.pdf
Landscape Catalogue 2024 Australia-1.pdfAarwolf Industries LLC
 
Use of FIDO in the Payments and Identity Landscape: FIDO Paris Seminar.pptx
Use of FIDO in the Payments and Identity Landscape: FIDO Paris Seminar.pptxUse of FIDO in the Payments and Identity Landscape: FIDO Paris Seminar.pptx
Use of FIDO in the Payments and Identity Landscape: FIDO Paris Seminar.pptxLoriGlavin3
 
Decarbonising Buildings: Making a net-zero built environment a reality
Decarbonising Buildings: Making a net-zero built environment a realityDecarbonising Buildings: Making a net-zero built environment a reality
Decarbonising Buildings: Making a net-zero built environment a realityIES VE
 
The Role of FIDO in a Cyber Secure Netherlands: FIDO Paris Seminar.pptx
The Role of FIDO in a Cyber Secure Netherlands: FIDO Paris Seminar.pptxThe Role of FIDO in a Cyber Secure Netherlands: FIDO Paris Seminar.pptx
The Role of FIDO in a Cyber Secure Netherlands: FIDO Paris Seminar.pptxLoriGlavin3
 
Generative Artificial Intelligence: How generative AI works.pdf
Generative Artificial Intelligence: How generative AI works.pdfGenerative Artificial Intelligence: How generative AI works.pdf
Generative Artificial Intelligence: How generative AI works.pdfIngrid Airi González
 
So einfach geht modernes Roaming fuer Notes und Nomad.pdf
So einfach geht modernes Roaming fuer Notes und Nomad.pdfSo einfach geht modernes Roaming fuer Notes und Nomad.pdf
So einfach geht modernes Roaming fuer Notes und Nomad.pdfpanagenda
 
Glenn Lazarus- Why Your Observability Strategy Needs Security Observability
Glenn Lazarus- Why Your Observability Strategy Needs Security ObservabilityGlenn Lazarus- Why Your Observability Strategy Needs Security Observability
Glenn Lazarus- Why Your Observability Strategy Needs Security Observabilityitnewsafrica
 
A Framework for Development in the AI Age
A Framework for Development in the AI AgeA Framework for Development in the AI Age
A Framework for Development in the AI AgeCprime
 
A Journey Into the Emotions of Software Developers
A Journey Into the Emotions of Software DevelopersA Journey Into the Emotions of Software Developers
A Journey Into the Emotions of Software DevelopersNicole Novielli
 
The State of Passkeys with FIDO Alliance.pptx
The State of Passkeys with FIDO Alliance.pptxThe State of Passkeys with FIDO Alliance.pptx
The State of Passkeys with FIDO Alliance.pptxLoriGlavin3
 
Microservices, Docker deploy and Microservices source code in C#
Microservices, Docker deploy and Microservices source code in C#Microservices, Docker deploy and Microservices source code in C#
Microservices, Docker deploy and Microservices source code in C#Karmanjay Verma
 
Potential of AI (Generative AI) in Business: Learnings and Insights
Potential of AI (Generative AI) in Business: Learnings and InsightsPotential of AI (Generative AI) in Business: Learnings and Insights
Potential of AI (Generative AI) in Business: Learnings and InsightsRavi Sanghani
 
Email Marketing Automation for Bonterra Impact Management (fka Social Solutio...
Email Marketing Automation for Bonterra Impact Management (fka Social Solutio...Email Marketing Automation for Bonterra Impact Management (fka Social Solutio...
Email Marketing Automation for Bonterra Impact Management (fka Social Solutio...Jeffrey Haguewood
 
Scale your database traffic with Read & Write split using MySQL Router
Scale your database traffic with Read & Write split using MySQL RouterScale your database traffic with Read & Write split using MySQL Router
Scale your database traffic with Read & Write split using MySQL RouterMydbops
 
2024 April Patch Tuesday
2024 April Patch Tuesday2024 April Patch Tuesday
2024 April Patch TuesdayIvanti
 
Arizona Broadband Policy Past, Present, and Future Presentation 3/25/24
Arizona Broadband Policy Past, Present, and Future Presentation 3/25/24Arizona Broadband Policy Past, Present, and Future Presentation 3/25/24
Arizona Broadband Policy Past, Present, and Future Presentation 3/25/24Mark Goldstein
 
Transcript: New from BookNet Canada for 2024: BNC SalesData and LibraryData -...
Transcript: New from BookNet Canada for 2024: BNC SalesData and LibraryData -...Transcript: New from BookNet Canada for 2024: BNC SalesData and LibraryData -...
Transcript: New from BookNet Canada for 2024: BNC SalesData and LibraryData -...BookNet Canada
 
Zeshan Sattar- Assessing the skill requirements and industry expectations for...
Zeshan Sattar- Assessing the skill requirements and industry expectations for...Zeshan Sattar- Assessing the skill requirements and industry expectations for...
Zeshan Sattar- Assessing the skill requirements and industry expectations for...itnewsafrica
 

Recently uploaded (20)

Generative AI - Gitex v1Generative AI - Gitex v1.pptx
Generative AI - Gitex v1Generative AI - Gitex v1.pptxGenerative AI - Gitex v1Generative AI - Gitex v1.pptx
Generative AI - Gitex v1Generative AI - Gitex v1.pptx
 
Top 10 Hubspot Development Companies in 2024
Top 10 Hubspot Development Companies in 2024Top 10 Hubspot Development Companies in 2024
Top 10 Hubspot Development Companies in 2024
 
Landscape Catalogue 2024 Australia-1.pdf
Landscape Catalogue 2024 Australia-1.pdfLandscape Catalogue 2024 Australia-1.pdf
Landscape Catalogue 2024 Australia-1.pdf
 
Use of FIDO in the Payments and Identity Landscape: FIDO Paris Seminar.pptx
Use of FIDO in the Payments and Identity Landscape: FIDO Paris Seminar.pptxUse of FIDO in the Payments and Identity Landscape: FIDO Paris Seminar.pptx
Use of FIDO in the Payments and Identity Landscape: FIDO Paris Seminar.pptx
 
Decarbonising Buildings: Making a net-zero built environment a reality
Decarbonising Buildings: Making a net-zero built environment a realityDecarbonising Buildings: Making a net-zero built environment a reality
Decarbonising Buildings: Making a net-zero built environment a reality
 
The Role of FIDO in a Cyber Secure Netherlands: FIDO Paris Seminar.pptx
The Role of FIDO in a Cyber Secure Netherlands: FIDO Paris Seminar.pptxThe Role of FIDO in a Cyber Secure Netherlands: FIDO Paris Seminar.pptx
The Role of FIDO in a Cyber Secure Netherlands: FIDO Paris Seminar.pptx
 
Generative Artificial Intelligence: How generative AI works.pdf
Generative Artificial Intelligence: How generative AI works.pdfGenerative Artificial Intelligence: How generative AI works.pdf
Generative Artificial Intelligence: How generative AI works.pdf
 
So einfach geht modernes Roaming fuer Notes und Nomad.pdf
So einfach geht modernes Roaming fuer Notes und Nomad.pdfSo einfach geht modernes Roaming fuer Notes und Nomad.pdf
So einfach geht modernes Roaming fuer Notes und Nomad.pdf
 
Glenn Lazarus- Why Your Observability Strategy Needs Security Observability
Glenn Lazarus- Why Your Observability Strategy Needs Security ObservabilityGlenn Lazarus- Why Your Observability Strategy Needs Security Observability
Glenn Lazarus- Why Your Observability Strategy Needs Security Observability
 
A Framework for Development in the AI Age
A Framework for Development in the AI AgeA Framework for Development in the AI Age
A Framework for Development in the AI Age
 
A Journey Into the Emotions of Software Developers
A Journey Into the Emotions of Software DevelopersA Journey Into the Emotions of Software Developers
A Journey Into the Emotions of Software Developers
 
The State of Passkeys with FIDO Alliance.pptx
The State of Passkeys with FIDO Alliance.pptxThe State of Passkeys with FIDO Alliance.pptx
The State of Passkeys with FIDO Alliance.pptx
 
Microservices, Docker deploy and Microservices source code in C#
Microservices, Docker deploy and Microservices source code in C#Microservices, Docker deploy and Microservices source code in C#
Microservices, Docker deploy and Microservices source code in C#
 
Potential of AI (Generative AI) in Business: Learnings and Insights
Potential of AI (Generative AI) in Business: Learnings and InsightsPotential of AI (Generative AI) in Business: Learnings and Insights
Potential of AI (Generative AI) in Business: Learnings and Insights
 
Email Marketing Automation for Bonterra Impact Management (fka Social Solutio...
Email Marketing Automation for Bonterra Impact Management (fka Social Solutio...Email Marketing Automation for Bonterra Impact Management (fka Social Solutio...
Email Marketing Automation for Bonterra Impact Management (fka Social Solutio...
 
Scale your database traffic with Read & Write split using MySQL Router
Scale your database traffic with Read & Write split using MySQL RouterScale your database traffic with Read & Write split using MySQL Router
Scale your database traffic with Read & Write split using MySQL Router
 
2024 April Patch Tuesday
2024 April Patch Tuesday2024 April Patch Tuesday
2024 April Patch Tuesday
 
Arizona Broadband Policy Past, Present, and Future Presentation 3/25/24
Arizona Broadband Policy Past, Present, and Future Presentation 3/25/24Arizona Broadband Policy Past, Present, and Future Presentation 3/25/24
Arizona Broadband Policy Past, Present, and Future Presentation 3/25/24
 
Transcript: New from BookNet Canada for 2024: BNC SalesData and LibraryData -...
Transcript: New from BookNet Canada for 2024: BNC SalesData and LibraryData -...Transcript: New from BookNet Canada for 2024: BNC SalesData and LibraryData -...
Transcript: New from BookNet Canada for 2024: BNC SalesData and LibraryData -...
 
Zeshan Sattar- Assessing the skill requirements and industry expectations for...
Zeshan Sattar- Assessing the skill requirements and industry expectations for...Zeshan Sattar- Assessing the skill requirements and industry expectations for...
Zeshan Sattar- Assessing the skill requirements and industry expectations for...
 

RではじめるTwitter解析

  • 1. R Twitter R 2011 (2011/11/26) @a_bicky
  • 2. • Takeshi Arabiki ‣ ‣ Twitter & : @a_bicky & id:a_bicky • R • http://d.hatena.ne.jp/a_bicky/
  • 3. R Osaka.R #4 Tokyo.R #16 Tsukuba.R #9 http://www.slideshare.net/abicky/twitterr http://www.slideshare.net/abicky/r-9034336 http://www.slideshare.net/abicky/r-10128090
  • 4.
  • 5.
  • 10. http://twilog.org/ http://twitraq.userlocal.jp/ http://whotwi.com/ http://tweetstats.com/
  • 11. http://twilog.org/ http://twitraq.userlocal.jp/ R http://whotwi.com/ http://tweetstats.com/
  • 12. Twitter • • reshape2 • ggplot2 •
  • 13. Twitter • • reshape2 • ggplot2 •
  • 14.
  • 15. twitteR twitteR > library(twitteR) # twitteR > # (twitteR 0.99.15 ) > Sys.setlocale("LC_TIME", "C") [1] "C" > # @a_bicky 3,200 RT > statuses <- userTimeline("a_bicky", n = 3200)
  • 16. status > # R5 > ls.str(statuses[[1]]) created : POSIXct[1:1], format: "2011-11-23 22:16:24" favorited : logi FALSE ↑ UTC id : chr "139467359571296256" initFields : Formal class 'refMethodDef' [package "methods"] with 5 slots initialize : Formal class 'refMethodDef' [package "methods"] with 5 slots replyToSID : chr(0) replyToSN : chr(0) replyToUID : chr(0) screenName : chr "a_bicky" ! Twitter statusSource : chr "<a href="http://sites.google.com/site/yorufukurou/" rel="nofollow">YoruFukurou</a>" text : chr " " truncated : logi FALSE ↑
  • 17. > statusDF <- twListToDF(statuses) > str(statusDF, vec.len = 1) 'data.frame': 3159 obs. of 10 variables: $ text : chr " " ... ↑ $ favorited : logi FALSE ... $ replyToSN : logi NA ... $ created : POSIXct, format: "2011-11-23 22:16:24" ... $ truncated : logi FALSE ... ↑ UTC $ replyToSID : logi NA ... $ id : chr "139467359571296256" ... $ replyToUID : logi NA ... $ statusSource: chr "<a href="http://sites.google.com/site/yorufukurou/" rel="nofollow">YoruFukurou</a>" ... $ screenName : chr "a_bicky" ...
  • 18. > wday.abb <- c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun") > statusDF <- within(statusDF, { + attr(created, "tzone") <- "Asia/Tokyo" # JST + statusSource <- factor(gsub("<a .*?>(.*?)</a>", "\\1", statusSource)) # HTML + date <- factor(format(created, "%Y-%m-%d")) # + hour <- NULL; month <- NULL; year <- NULL; wday <- NULL + with(as.POSIXlt(created), { + hour <<- factor(hour) # + month <<- factor(mon + 1) # + year <<- factor(year + 1900) # + wday <<- factor((wday + 6) %% 7, labels = wday.abb) # + }) + textLength <- nchar(text) # + # , URL, + cleanText <- removeSpecialStr(text) + cleanTextLength <- nchar(cleanText) # URL + })
  • 19. > # Twitter > topSources <- names(head(sort(table(statusDF$statusSource), decreasing = TRUE), 5)) > statusDF <- within(statusDF, { + statusSource <- as.character(statusSource) + statusSource[!statusSource %in% topSources] <- "other" + # + statusSource <- factor(statusSource, levels = names(sort(table(statusSource), dec = TRUE))) + })
  • 20. Twitter • • reshape2 • ggplot2 •
  • 22.
  • 23. Excel 9 11 ”Twitter for iPhone”, ”YoruFukurou” Sat Mon 12 23
  • 24. reshape2 > library(reshape2) > acast(melt(statusDF, id.vars = c("statusSource", "wday", "month", "hour"), + measure.vars = c("textLength")), + month + statusSource ~ wday, mean, + subset = .(statusSource %in% c("YoruFukurou", "Twitter for iPhone") + & month %in% 9:11 & hour %in% 12:23 + & wday %in% c("Mon", "Sat", "Sun"))) Mon Sat Sun 9_YoruFukurou 43 42.13333 54.76471 9_Twitter for iPhone 16 27.70000 20.50000 10_YoruFukurou 61 41.70175 56.98333 10_Twitter for iPhone NaN 27.00000 24.50000 11_YoruFukurou 35 41.08197 57.32609 11_Twitter for iPhone NaN NaN 32.00000
  • 25. reshape2 > library(reshape2) > acast(melt(statusDF, id.vars = c("statusSource", "wday", "month", "hour"), + measure.vars = c("textLength")), + month + statusSource ~ wday, mean, + subset = .(statusSource %in% c("YoruFukurou", "Twitter for iPhone") + & month %in% 9:11 & hour %in% 12:23 + & wday %in% c("Mon", "Sat", "Sun"))) Mon Sat Sun 9_YoruFukurou 43 42.13333 54.76471 9_Twitter for iPhone 16 27.70000 20.50000 10_YoruFukurou 61 41.70175 56.98333 10_Twitter for iPhone NaN 27.00000 24.50000 11_YoruFukurou 35 41.08197 57.32609 11_Twitter for iPhone NaN NaN 32.00000
  • 26. reshape2 > library(reshape2) > acast(melt(statusDF, id.vars = c("statusSource", "wday", "month", "hour"), + measure.vars = c("textLength")), + month + statusSource ~ wday, mean, + subset = .(statusSource %in% c("YoruFukurou", "Twitter for iPhone") + & month %in% 9:11 & hour %in% 12:23 + & wday %in% c("Mon", "Sat", "Sun"))) Mon Sat Sun 9_YoruFukurou 43 42.13333 54.76471 9_Twitter for iPhone 16 27.70000 20.50000 10_YoruFukurou 61 41.70175 56.98333 10_Twitter for iPhone NaN 27.00000 24.50000 11_YoruFukurou 35 41.08197 57.32609 11_Twitter for iPhone NaN NaN 32.00000 R
  • 27. reshape2 melt melt cast melt cast > mstatus <- melt(statusDF, + id.vars = c("statusSource", "wday", "year", "month", "hour", "date"), + measure.vars = c("textLength", "cleanTextLength")) > mstatus[3157:3162, ] statusSource wday year month hour date variable value 3157 web Sun 2011 3 20 2011-03-13 textLength 72 3158 web Sun 2011 3 16 2011-03-13 textLength 24 3159 web Sun 2011 3 14 2011-03-13 textLength 82 3160 YoruFukurou Wed 2011 11 1 2011-11-23 cleanTextLength 87 3161 YoruFukurou Wed 2011 11 1 2011-11-23 cleanTextLength 14 3162 YoruFukurou Wed 2011 11 1 2011-11-23 cleanTextLength 21 id
  • 28. reshape2 cast cast formula fun.aggregate > args(acast) # array acast function (data, formula, fun.aggregate = NULL, ..., margins = NULL, subset = NULL, fill = NULL, drop = TRUE, value_var = guess_value(data)) NULL > args(dcast) # data.frame dcast function (data, formula, fun.aggregate = NULL, ..., margins = NULL, subset = NULL, fill = NULL, drop = TRUE, value_var = guess_value(data)) NULL formula ... . acast hoge ~ fuga ~ piyo ※dcast 1 hoge ~ fuga + piyo
  • 29. > # > acast(mstatus, . ~ wday, length, subset = .(variable == "textLength")) ↑ cleanTextLength
  • 30. > # > acast(mstatus, . ~ wday, length, subset = .(variable == "textLength")) Mon Tue Wed Thu Fri Sat Sun [1,] 408 360 258 294 334 801 704 >
  • 31. > # > acast(mstatus, . ~ wday, length, subset = .(variable == "textLength")) Mon Tue Wed Thu Fri Sat Sun [1,] 408 360 258 294 334 801 704 >
  • 32. > # > acast(mstatus, . ~ wday, length, subset = .(variable == "textLength")) Mon Tue Wed Thu Fri Sat Sun [1,] 408 360 258 294 334 801 704 > # > acast(mstatus, hour ~ wday, length, subset = .(variable == "textLength"))
  • 33. > # > acast(mstatus, . ~ wday, length, subset = .(variable == "textLength")) Mon Tue Wed Thu Fri Sat Sun [1,] 408 360 258 294 334 801 704 > # > acast(mstatus, hour ~ wday, length, subset = .(variable == "textLength")) Mon Tue Wed Thu Fri Sat Sun 0 65 69 26 46 46 49 40 1 48 19 11 15 27 44 37 2 31 24 6 16 17 23 17 3 27 19 4 11 14 17 10 4 4 15 1 7 4 5 7 5 5 11 1 4 3 4 5 6 4 14 3 6 9 8 1
  • 34. > # > # > acast(mstatus, hour ~ wday + month, length, subset = .(variable == "textLength"))
  • 35. > # > # > acast(mstatus, hour ~ wday + month, length, subset = .(variable == "textLength")) Mon_3 Mon_4 Mon_5 Mon_6 Mon_7 Mon_8 Mon_9 Mon_10 Mon_11 Tue_3 Tue_4 0 3 4 13 3 1 10 7 15 9 4 2 1 0 0 1 0 1 9 16 12 9 1 0 2 2 0 0 0 2 7 6 7 7 2 0
  • 36. > # > # > acast(mstatus, hour ~ wday + month, length, subset = .(variable == "textLength")) Mon_3 Mon_4 Mon_5 Mon_6 Mon_7 Mon_8 Mon_9 Mon_10 Mon_11 Tue_3 Tue_4 0 3 4 13 3 1 10 7 15 9 4 2 1 0 0 1 0 1 9 16 12 9 1 0 2 2 0 0 0 2 7 6 7 7 2 0 > # 3 > acast(mstatus, hour ~ wday ~ month, length, subset = .(variable == "textLength"))
  • 37. > # > # > acast(mstatus, hour ~ wday + month, length, subset = .(variable == "textLength")) Mon_3 Mon_4 Mon_5 Mon_6 Mon_7 Mon_8 Mon_9 Mon_10 Mon_11 Tue_3 Tue_4 0 3 4 13 3 1 10 7 15 9 4 2 1 0 0 1 0 1 9 16 12 9 1 0 2 2 0 0 0 2 7 6 7 7 2 0 > # 3 > acast(mstatus, hour ~ wday ~ month, length, subset = .(variable == "textLength")) , , 3 Mon Tue Wed Thu Fri Sat Sun 0 3 4 1 0 1 6 4 1 0 1 3 0 0 0 1
  • 38. > # > # > acast(mstatus, hour ~ wday + month, length, subset = .(variable == "textLength")) Mon_3 Mon_4 Mon_5 Mon_6 Mon_7 Mon_8 Mon_9 Mon_10 Mon_11 Tue_3 Tue_4 0 3 4 13 3 1 10 7 15 9 4 2 1 0 0 1 0 1 9 16 12 9 1 0 2 2 0 0 0 2 7 6 7 7 2 0 > # 3 > acast(mstatus, hour ~ wday ~ month, length, subset = .(variable == "textLength")) , , 3 3 Mon Tue Wed Thu Fri Sat Sun 0 3 4 1 0 1 6 4 1 0 1 3 0 0 0 1
  • 39.
  • 40.
  • 41.
  • 42. Twitter reshape2 1 > # > dcast(mstatus, statusSource ~ ., + function(x) list(c(mean = mean(x), sd = sd(x))), + fill = list(c(mean = NaN, sd = NA)), ← + subset = .(variable == "textLength"))
  • 43. Twitter reshape2 1 > # > dcast(mstatus, statusSource ~ ., + function(x) list(c(mean = mean(x), sd = sd(x))), + fill = list(c(mean = NaN, sd = NA)), ← + subset = .(variable == "textLength")) statusSource NA 1 YoruFukurou 47.51462, 32.57973 2 web 57.02720, 36.33534 3 Twitter for iPhone 33.42342, 23.06466 4 Twitter for Android 28.49048, 20.08457 5 Hatena 80.00000, 25.94212 6 other 52.58621, 33.12180 >
  • 44. Twitter reshape2 1 > # > dcast(mstatus, statusSource ~ ., + function(x) list(c(mean = mean(x), sd = sd(x))), + fill = list(c(mean = NaN, sd = NA)), ← + subset = .(variable == "textLength")) statusSource NA 1 YoruFukurou 47.51462, 32.57973 2 web 57.02720, 36.33534 3 Twitter for iPhone 33.42342, 23.06466 4 Twitter for Android 28.49048, 20.08457 5 Hatena 80.00000, 25.94212 6 other 52.58621, 33.12180 >
  • 45. > # t > pc <- unlist(subset(statusDF, + statusSource %in% c("YoruFukurou", "web"), + textLength)) > sp <- unlist(subset(statusDF, + grepl("(iPhone|Android)", statusSource), + textLength)) > t.test(sp, pc, var.equal = FALSE) Welch Two Sample t-test !! data: sp and pc t = -15.7921, df = 1588.246, p-value < 2.2e-16 alternative hypothesis: true difference in means is not equal to 0 95 percent confidence interval: -19.85334 -15.46645 sample estimates: mean of x mean of y 31.83945 49.49935
  • 46. > # t > pc <- unlist(subset(statusDF, + statusSource %in% c("YoruFukurou", "web"), + textLength)) > sp <- unlist(subset(statusDF, + grepl("(iPhone|Android)", statusSource), + textLength)) > t.test(sp, pc, var.equal = FALSE) Welch Two Sample t-test !! data: sp and pc t = -15.7921, df = 1588.246, p-value < 2.2e-16 alternative hypothesis: true difference in means is not equal to 0 95 percent confidence interval: -19.85334 -15.46645 sample estimates: mean of x mean of y 31.83945 49.49935
  • 47. > extractScreenNames <- function(text, strict = TRUE) { + if (strict) { + # Twitter screen_name + regex <- "(?:(?<!\\w)([@＠])((?>\\w+))(?![@＠])|[\\s\\S])" + } else { + # hoge@example.com + regex <- "(?:([@＠])(\\w+)|[\\s\\S])" + } + screenNames <- gsub(regex, "\\1\\2", text, perl = TRUE) + unique(unlist(strsplit(substring(screenNames, 2), "[@＠]"))) + } > screenNames <- unlist(lapply(statusDF$text, extractScreenNames)) > head(sort(table(screenNames), decreasing = TRUE), 10) # Top 10 screenNames naopr __gfx__ hirota_inoue mandy_44 ask_a_lie 105 85 51 47 40 ken_nishi nokuno yokkuns JinJin0613 kanon19_rie 39 39 33 20 20
  • 48. Twitter • • reshape2 • ggplot2 •
  • 50. ggplot2 plot(statusDF$wday, col = "blue") ggplot2 qplot(wday, data = statusDF, fill = I("blue"), alpha = I(0.7), xlab = "", ylab = "")
  • 51. ggplot2 qplot(wday, data = statusDF, fill = statusSource, xlab = "", ylab = "")
  • 52. ggplot2 qplot(wday, data = statusDF, facets = ~ statusSource, fill = I("blue"), alpha = I(0.7), xlab = "", ylab = "") qplot(wday, data = statusDF, fill = statusSource, xlab = "", ylab = "")
  • 53. ggplot2 qplot(wday, data = statusDF, facets = ~ statusSource, fill = I("blue"), alpha = I(0.7), xlab = "", ylab = "") qplot(wday, data = statusDF, fill = statusSource, xlab = "", ylab = "")
  • 54. qplot ggplot2 > args(qplot) function (x, y = NULL, z = NULL, ..., data, facets = . ~ ., margins = FALSE, geom = "auto", stat = list(NULL), position = list(NULL), xlim = c(NA, NA), ylim = c(NA, NA), log = "", main = NULL, xlab = deparse(substitute(x)), ylab = deparse(substitute(y)), asp = NA) NULL
  • 55. qplot geom geom area: bar: histogram: line: point:
  • 56. qplot geom geom area: bar: histogram: line: point: qplot(as.integer(wday), data = statusDF, geom = "area", stat = "bin", fill = statusSource, xlab = "", ylab = "", binwidth = 1)
  • 57. qplot geom geom area: bar: histogram: line: point: qplot(wday, data = statusDF, geom = "bar", stat = "bin", fill = statusSource, xlab = "", ylab = "")
  • 58. qplot geom geom area: bar: histogram: line: point: qplot(as.integer(wday), data = statusDF, geom = "line", stat = "bin", colour = statusSource, xlab = "", ylab = "", binwidth = 1)
  • 59. qplot geom geom area: bar: histogram: line: point: qplot(wday, data = statusDF, geom = "point", stat = "bin", colour = statusSource, xlab = "", ylab = "")
  • 60. qplot position position geom dodge : fill : 1 jitter : stack :
  • 61. qplot position position geom dodge : fill : 1 jitter : stack : qplot(wday, data = statusDF, fill = statusSource, position = "dodge", xlab = "", ylab = "")
  • 62. qplot position position geom dodge : fill : 1 jitter : stack : qplot(wday, data = statusDF, fill = statusSource, position = "fill", xlab = "", ylab = "")
  • 63. qplot position position geom dodge : fill : 1 jitter : stack : qplot(wday, data = statusDF, fill = statusSource, position = "jitter", xlab = "", ylab = "")
  • 64. qplot position position geom dodge : fill : 1 jitter : stack : qplot(wday, data = statusDF, fill = statusSource, position = "stack", xlab = "", ylab = "")
  • 65. qplot facets facets geom ~ : 1 ~ 2: 1, 2 ※reshape2 1 ~ 2 + 3
  • 66. qplot facets facets geom ~ : 1 ~ 2: 1, 2 ※reshape2 1 ~ 2 + 3 qplot(wday, data = statusDF, xlab = "", ylab = "", facets = ~ statusSource)
  • 67. qplot facets facets geom ~ : 1 ~ 2: 1, 2 ※reshape2 1 ~ 2 + 3 qplot(wday, data = statusDF, xlab = "", ylab = "", facets = month ~ statusSource)
  • 68. qplot alpha : colour (color) : fill : linetype : size : colour, fill, linetype statusSource fill = I("blue") I (AsIs)
  • 69. qplot alpha : colour (color) : fill : linetype : size : qplot(wday, data = statusDF, xlab = "", ylab = "", alpha = as.integer(wday))
  • 70. qplot alpha : colour (color) : fill : linetype : size : qplot(wday, data = statusDF, xlab = "", ylab = "", colour = statusSource)
  • 71. qplot alpha : colour (color) : fill : linetype : size : qplot(wday, data = statusDF, xlab = "", ylab = "", fill = statusSource)
  • 72. qplot alpha : colour (color) : fill : linetype : size : qplot(wday, data = statusDF, xlab = "", ylab = "", linetype = statusSource, colour = statusSource)
  • 73.
  • 74.
  • 75. whotwi http://whotwi.com/
  • 76. whotwi http://whotwi.com/
  • 77. whotwi > # Twitter > # melt cast xtabs > cnt <- as.data.frame(xtabs(~ hour + wday + statusSource, statusDF)) > head(cnt, 3) hour wday statusSource Freq 1 0 Mon YoruFukurou 48 2 1 Mon YoruFukurou 38 3 2 Mon YoruFukurou 25
  • 78. whotwi > # Twitter > # melt cast xtabs > cnt <- as.data.frame(xtabs(~ hour + wday + statusSource, statusDF)) > head(cnt, 3) hour wday statusSource Freq 1 0 Mon YoruFukurou 48 2 1 Mon YoruFukurou 38 3 2 Mon YoruFukurou 25 > freqSources <- by(cnt, cnt[c("hour", "wday")], function(df) { + # + freqSource <- with(df, statusSource[order(Freq, decreasing = TRUE) [1]]) + cbind(df[1, c("hour", "wday")], freqSource) + }) > freqSources <- do.call(rbind, freqSources) > head(freqSources, 3) hour wday freqSource 1 0 Mon YoruFukurou 2 1 Mon YoruFukurou 3 2 Mon YoruFukurou
  • 79. whotwi > # > cntSum <- as.data.frame(xtabs(Freq ~ hour + wday, cnt)) > head(cntSum, 3) hour wday Freq 1 0 Mon 65 2 1 Mon 48 3 2 Mon 31
  • 80. whotwi > # > cntSum <- as.data.frame(xtabs(Freq ~ hour + wday, cnt)) > head(cntSum, 3) hour wday Freq 1 0 Mon 65 2 1 Mon 48 3 2 Mon 31 > # > data <- merge(cntSum, freqSources) > # > data$wday <- factor(data$wday, levels = rev(levels(data$wday))) > # > data$Freq <- log2(data$Freq)
  • 81. whotwi > # > cntSum <- as.data.frame(xtabs(Freq ~ hour + wday, cnt)) > head(cntSum, 3) hour wday Freq 1 0 Mon 65 2 1 Mon 48 3 2 Mon 31 > # > data <- merge(cntSum, freqSources) > # > data$wday <- factor(data$wday, levels = rev(levels(data$wday))) > # > data$Freq <- log2(data$Freq) > p <- qplot(hour, wday, data = data, xlab = "", ylab = "", + geom = "point", colour = freqSource, size = Freq) > p # print(p)
  • 82. whotwi > # > cntSum <- as.data.frame(xtabs(Freq ~ hour + wday, cnt)) > head(cntSum, 3) hour wday Freq 1 0 Mon 65 2 1 Mon 48 3 2 Mon 31 > # > data <- merge(cntSum, freqSources) > # > data$wday <- factor(data$wday, levels = rev(levels(data$wday))) > # > data$Freq <- log2(data$Freq) > p <- qplot(hour, wday, data = data, xlab = "", ylab = "", + geom = "point", colour = freqSource, size = Freq) > p # print(p)
  • 85. whotwi > # whotwi theme > theme_whotwi <- function() { + opts( # + panel.background = theme_rect(fill = NA, colour = NA), + # + legend.key = theme_rect(fill = NA, colour = NA), + # + axis.ticks = theme_segment(colour = NA)) + } > p2 <- p + theme_whotwi() + scale_size(legend = FALSE) + scale_colour_hue(name = "") > p2
  • 86. whotwi > # whotwi theme > theme_whotwi <- function() { + opts( # + panel.background = theme_rect(fill = NA, colour = NA), + # + legend.key = theme_rect(fill = NA, colour = NA), + # + axis.ticks = theme_segment(colour = NA)) + } > p2 <- p + theme_whotwi() + scale_size(legend = FALSE) + scale_colour_hue(name = "") > p2
  • 90. whotwi PC
  • 91. whotwi PC PC
  • 92. Twitter • • reshape2 • ggplot2 •
  • 93.
  • 94.
  • 98. RMeCab MeCab R > library(RMeCab) > (docDF(data.frame(" "), column = 1, type = 1)) number of extracted terms = 5 now making a data frame. wait a while! TERM POS1 POS2 Row1 1 1 2 1 3 1 4 2 5 2
  • 99. http://www.lr.pi.titech.ac.jp/~takamura/pndic_ja.html : : :1 : : :0.999995 : : :0.999979 : : :0.999979 : : :0.999645 : : :0.999486 : : :0.999314 ...
  • 100. > # > pndic <- read.table("http://www.lr.pi.titech.ac.jp/~takamura/pubs/pn_ja.dic", + sep = ":", + col.names = c("term", "kana", "pos", "value"), + colClasses = c("character", "character", "factor", "numeric"), + fileEncoding = "Shift_JIS") > # > # > pndic2 <- aggregate(value ~ term + pos, pndic, mean)
  • 101. > # pndic > pos <- unique(pndic2$pos) > tweetDF <- docDF(statusDF, column = "cleanText", type = 1, pos = pos) number of extracted terms = 7164 now making a data frame. wait a while! > tweetDF[2900:2904, 1:5] TERM POS1 POS2 Row1 Row2 2900 0 0 2901 0 0 2902 0 0 2903 0 0 2904 0 0 > # pndic > tweetDF <- subset(tweetDF, TERM %in% pndic2$term) > # > tweetDF <- merge(tweetDF, pndic2, by.x = c("TERM", "POS1"), by.y = c ("term", "pos"))
  • 102. > # > score <- colSums(tweetDF[4:(ncol(tweetDF) - 1)] * tweetDF$value) > # > sum(score > 0) [1] 117
  • 103. > # > score <- colSums(tweetDF[4:(ncol(tweetDF) - 1)] * tweetDF$value) > # > sum(score > 0) [1] 117
  • 104. > # > score <- colSums(tweetDF[4:(ncol(tweetDF) - 1)] * tweetDF$value) > # > sum(score > 0) [1] 117 > # > sum(score < 0) [1] 2765
  • 105. > # > score <- colSums(tweetDF[4:(ncol(tweetDF) - 1)] * tweetDF$value) > # > sum(score > 0) [1] 117 > # > sum(score < 0) [1] 2765
  • 106. > # > score <- colSums(tweetDF[4:(ncol(tweetDF) - 1)] * tweetDF$value) > # > sum(score > 0) [1] 117 > # > sum(score < 0) [1] 2765 > # > sum(score == 0) [1] 277
  • 107. > # > score <- colSums(tweetDF[4:(ncol(tweetDF) - 1)] * tweetDF$value) > # > sum(score > 0) [1] 117 > # > sum(score < 0) [1] 2765 > # > sum(score == 0) [1] 277
  • 108. > # > score <- colSums(tweetDF[4:(ncol(tweetDF) - 1)] * tweetDF$value) > # > sum(score > 0) [1] 117 > # > sum(score < 0) [1] 2765 > # > sum(score == 0) [1] 277
  • 109. > table(ifelse(pndic$value > 0, "positive", + ifelse(pndic$value == 0, "neutral", "negative"))) negative neutral positive 49983 20 5122
  • 110. > table(ifelse(pndic$value > 0, "positive", + ifelse(pndic$value == 0, "neutral", "negative"))) negative neutral positive 49983 20 5122
  • 111. > m <- mean(score) > # > tweetType <- factor(ifelse(score > m, "positive", + ifelse(score == m, "neutral", "negative")), + levels = c("positive", "neutral", "negative")) > table(tweetType) tweetType positive neutral negative 1912 0 1247
  • 112. > statusDF$tweetType <- droplevels(tweetType) > # > qplot(month, data = statusDF, + geom = "bar", fill = tweetType, position = "fill")
  • 113. > statusDF$tweetType <- droplevels(tweetType) > # > qplot(month, data = statusDF, + geom = "bar", fill = tweetType, position = "fill")
  • 114. > statusDF$tweetType <- droplevels(tweetType) > # > qplot(month, data = statusDF, + geom = "bar", fill = tweetType, position = "fill")
  • 115. > statusDF$tweetType <- droplevels(tweetType) > # > qplot(month, data = statusDF, + geom = "bar", fill = tweetType, position = "fill")
  • 116.
  • 117.
  • 118. twitteR • RJSONIO • • ID status ID • fav favorited TRUE • truncated TRUE • DM • status character factor
  • 119. twitteR • RJSONIO • • ID status ID • fav favorited TRUE • truncated TRUE • DM • status character factor
  • 120. OAuth ” ” twitteR -
  • 121.
  • 122. • twitteR • reshape2 R • ggplot2 • RMeCab R
  • 123. • twitteR • reshape2 R • ggplot2 • RMeCab R • PC •
  • 124.
  • 125.
  • 127. status > statuses[[1]]$text [1] " " > statuses[[1]]$getText() # [1] " " > # > statuses[[1]]$text <- " " > statuses[[1]]$getText() [1] " " > statuses[[1]]$setText("ggrks") # > statuses[[1]]$getText() [1] "ggrks" > # > statuses[[1]]$getCreated() [1] "2011-11-23 22:16:24 UTC"
  • 128. removeSpecialStr removeSpecialStr <- function(text) { removeURL(removeHashTag(removeScreenName(text))) }
  • 129. removeScreenName removeScreenName <- function(text, strict = TRUE) { if (strict) { regex <- "(?<!\\w)[@＠](?>\\w+)(?![@＠])" } else { regex <- "[@＠]\\w+" } gsub(regex, "", text, perl = TRUE) }
  • 130. removeURL removeURL <- function(text, strict = TRUE) { if (strict) { regex <- "(?<![-.\\w#@=!'\"/])https?://(?:[^:]+:.+@)?(?:[0-9A-Za-z][-0-9A-Za-z]*(?<!-)\\.)+[A-Za-z]+(?:/[-\\w#%=+,.?!&~]*)*" } else { regex <- "https?://[-\\w#%=+,.?!&~/]+" } gsub(regex, "", text, perl = TRUE) }
  • 131. removeHashTag removeHashTag <- function(text, strict = TRUE) { delimiters <- "\\s,.\u3000-\u3002\uFF01\uFF1F" # cf. http://nobu666.com/2011/07/13/914.html validJa <- "\u3041-\u3094\u3099-\u309C\u30A1-\u30FA\u30FC\u3400-\uD7A3\uFF10-\uFF19\uFF21-\uFF3A\uFF41-\uFF5A\uFF66-\uFF9E" if (strict) { regex <- sprintf("(^|[%s])(?:([#＃](?>[0-9]+)(?!\\w))|[#＃][\\w%s]+)", delimiters, validJa, validJa) } else { regex <- sprintf("[#＃][^%s]+", delimiters) } gsub(regex, "\\1\\2", text, perl = TRUE) }