dataframe - R: transfer the pattern of data.table -
my original data mydf(no duplicated) :
    group hed_pfnpi id  1:    aa    111111 18  2:    aa    111111 17  3:    aa    222222 18  4:    aa    333333 14  5:    aa    444444 13  6:    aa    555555 18  7:    aa    555555 24  8:    aa    222222 13  9:    aa    222222 17 10:    aa    333333 17 11:    bb    666666  9 12:    bb    666666  3 13:    bb    888888  9 14:    bb    999999 14 15:    bb    444444 13 16:    bb    555555  9 17:    bb    555555 24 18:    bb    888888 13 19:    bb    888888  3 20:    bb    999999  3 and want transfer mydf result table:
   group    1    2 weight id_list  1    aa 111111 222222      2   17,18  2    aa 111111 333333      1      17  3    aa 111111 555555      1      18  4    aa 222222 333333      1      17  5    aa 222222 444444      1      13  6    aa 222222 555555      1      18  7    bb 444444 888888      1      13  8    bb 555555 666666      1       9  9    bb 555555 888888      1       9 10    bb 666666 888888      2     3,9 11    bb 666666 999999      1       3 12    bb 888888 999999      1       3 first, group data according group column then
if hed_pfnpi share same id, become pair in result table;
id_list: correspondingly shared id;
weight: length of id_list
library(data.table) library(dplyr) library(magrittr) library(tidyverse)   mydf1 <- data.table(structure(list(group = rep("aa",10),hed_pfnpi = c(111111l, 111111l, 222222l, 333333l, 444444l,                                             555555l, 555555l, 222222l, 222222l, 333333l), id = c(18l, 17l,                                                                                                    18l, 14l, 13l, 18l, 24l, 13l, 17l, 17l)), .names = c("group","hed_pfnpi",                                                                                                                                                        "id"), class = "data.frame", row.names = c(na, -10l))) mydf2 <- data.table(structure(list(group = rep("bb",10),hed_pfnpi = c(666666l, 666666l, 888888l, 999999l, 444444l,                                              555555l, 555555l, 888888l, 888888l, 999999l), id = c(9l, 3l,                                                                                                     9l, 14l, 13l, 9l, 24l, 13l, 3l, 3l)), .names = c("group","hed_pfnpi",                                                                                                                                                     "id"), class = "data.frame", row.names = c(na, -10l))) mydf <- rbind(mydf1,mydf2)   # try code result <- merge(mydf, mydf, = "id", allow.cartesian=true) %>%    filter(group.x == group.y) %>%   transmute(group = group.x,             1 = pmin(hed_pfnpi.x, hed_pfnpi.y),             2 = pmax(hed_pfnpi.x, hed_pfnpi.y),             id) %>%    filter(one != two) %>%    unique() %>%    group_by(group,one, two) %>%    summarise(id_list = paste(id, collapse = ","),             weight = n()) %>%   select(group,one, two,weight, id_list) my try code here, can expected result, it's inefficient (crashed when data large). hope 1 can provide me better solution.
i'd (loading data.table , not other packages)...
mydf[,    cj(one = hed_pfnpi, 2 = hed_pfnpi)[one < two] , keyby=.(group, id)][,    .(n = .n, ids = tostring(id)) , keyby=.(group, one, two)] which gives
    group    1    2 n    ids  1:    aa 111111 222222 2 17, 18  2:    aa 111111 333333 1     17  3:    aa 111111 555555 1     18  4:    aa 222222 333333 1     17  5:    aa 222222 444444 1     13  6:    aa 222222 555555 1     18  7:    bb 444444 888888 1     13  8:    bb 555555 666666 1      9  9:    bb 555555 888888 1      9 10:    bb 666666 888888 2   3, 9 11:    bb 666666 999999 1      3 12:    bb 888888 999999 1      3 
Comments
Post a Comment