Skip to contents

A much faster replacement for dplyr::count.

Usage

fcount(x, ..., w = NULL, name = "N", add = FALSE,
      sort = FALSE, decreasing = FALSE)

fcountv(x, cols = NULL, w = NULL, name = "N", add = FALSE,
        sort = FALSE, ...)

Arguments

x

a data frame or list-like object, including 'grouped_df' or 'indexed_frame'. Atomic vectors or matrices can also be passed, but will be sent through qDF.

...

for fcount: names or sequences of columns to count cases by - passed to fselect. For fcountv: further arguments passed to GRP (such as decreasing, na.last, method, effect etc.). For fcount, leaving this empty will count on all columns.

cols

select columns to count cases by, using column names, indices, a logical vector or a selector function (e.g. is_categorical).

w

a numeric vector of weights, may contain missing values. In fcount this can also be the (unquoted) name of a column in the data frame. fcountv also supports a single character name. Note that the corresponding argument in dplyr::count is called wt, but collapse follows a package-wide convention of naming weights arguments w.

name

character. The name of the column containing the count or sum of weights. In dplyr::count it is called "n", but "N" is more consistent with the rest of collapse and data.table.

add

TRUE adds the count column to x. Alternatively add = "group_vars" (or add = "gv" for parsimony) can be used to retain only the variables selected for counting in x and the count.

sort, decreasing

arguments passed to GRP affecting the order of rows in the output (if add = FALSE), and the algorithm used for counting. In general, sort = FALSE is faster unless data is already sorted by the columns used for counting.

Value

If x is a list, an object of the same type as x with a column (name) added at the end giving the count. Otherwise, if x is atomic, a data frame returned from qDF(x) with the count column added. By default (add = FALSE), only the unique rows of the columns of x used for counting are returned.

Examples

fcount(mtcars, cyl, vs, am)
#>   cyl vs am  N
#> 1   6  0  1  3
#> 2   4  1  1  7
#> 3   6  1  0  4
#> 4   8  0  0 12
#> 5   4  1  0  3
#> 6   4  0  1  1
#> 7   8  0  1  2
fcountv(mtcars, cols = .c(cyl, vs, am))
#>   cyl vs am  N
#> 1   6  0  1  3
#> 2   4  1  1  7
#> 3   6  1  0  4
#> 4   8  0  0 12
#> 5   4  1  0  3
#> 6   4  0  1  1
#> 7   8  0  1  2
fcount(mtcars, cyl, vs, am, sort = TRUE)
#>   cyl vs am  N
#> 1   4  0  1  1
#> 2   4  1  0  3
#> 3   4  1  1  7
#> 4   6  0  1  3
#> 5   6  1  0  4
#> 6   8  0  0 12
#> 7   8  0  1  2
fcount(mtcars, cyl, vs, am, add = TRUE)
#>                    mpg cyl disp  hp drat    wt  qsec vs am gear carb  N
#> Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4  3
#> Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4  3
#> Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1  7
#> Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1  4
#> Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2 12
#>  [ reached 'max' / getOption("max.print") -- omitted 27 rows ]
fcount(mtcars, cyl, vs, am, add = "group_vars")
#>                     cyl vs am  N
#> Mazda RX4             6  0  1  3
#> Mazda RX4 Wag         6  0  1  3
#> Datsun 710            4  1  1  7
#> Hornet 4 Drive        6  1  0  4
#> Hornet Sportabout     8  0  0 12
#> Valiant               6  1  0  4
#> Duster 360            8  0  0 12
#> Merc 240D             4  1  0  3
#> Merc 230              4  1  0  3
#> Merc 280              6  1  0  4
#> Merc 280C             6  1  0  4
#> Merc 450SE            8  0  0 12
#> Merc 450SL            8  0  0 12
#> Merc 450SLC           8  0  0 12
#> Cadillac Fleetwood    8  0  0 12
#> Lincoln Continental   8  0  0 12
#> Chrysler Imperial     8  0  0 12
#>  [ reached 'max' / getOption("max.print") -- omitted 15 rows ]

## With grouped data
mtcars |> fgroup_by(cyl, vs, am) |> fcount()
#>   cyl vs am  N
#> 1   4  0  1  1
#> 2   4  1  0  3
#> 3   4  1  1  7
#> 4   6  0  1  3
#> 5   6  1  0  4
#> 6   8  0  0 12
#> 7   8  0  1  2
mtcars |> fgroup_by(cyl, vs, am) |> fcount(add = TRUE)
#>                    mpg cyl disp  hp drat    wt  qsec vs am gear carb  N
#> Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4  3
#> Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4  3
#> Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1  7
#> Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1  4
#> Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2 12
#>  [ reached 'max' / getOption("max.print") -- omitted 27 rows ]
#> 
#> Grouped by:  cyl, vs, am  [7 | 5 (3.8) 1-12] 
mtcars |> fgroup_by(cyl, vs, am) |> fcount(add = "group_vars")
#>                     cyl vs am  N
#> Mazda RX4             6  0  1  3
#> Mazda RX4 Wag         6  0  1  3
#> Datsun 710            4  1  1  7
#> Hornet 4 Drive        6  1  0  4
#> Hornet Sportabout     8  0  0 12
#> Valiant               6  1  0  4
#> Duster 360            8  0  0 12
#> Merc 240D             4  1  0  3
#> Merc 230              4  1  0  3
#> Merc 280              6  1  0  4
#> Merc 280C             6  1  0  4
#> Merc 450SE            8  0  0 12
#> Merc 450SL            8  0  0 12
#> Merc 450SLC           8  0  0 12
#> Cadillac Fleetwood    8  0  0 12
#> Lincoln Continental   8  0  0 12
#> Chrysler Imperial     8  0  0 12
#>  [ reached 'max' / getOption("max.print") -- omitted 15 rows ]
#> 
#> Grouped by:  cyl, vs, am  [7 | 5 (3.8) 1-12] 

## With indexed data: by default counting on the first index variable
wlddev |> findex_by(country, year) |> fcount()
#>                   country  N
#> 1             Afghanistan 61
#> 2                 Albania 61
#> 3                 Algeria 61
#> 4          American Samoa 61
#> 5                 Andorra 61
#> 6                  Angola 61
#> 7     Antigua and Barbuda 61
#> 8               Argentina 61
#> 9                 Armenia 61
#> 10                  Aruba 61
#> 11              Australia 61
#> 12                Austria 61
#> 13             Azerbaijan 61
#> 14           Bahamas, The 61
#> 15                Bahrain 61
#> 16             Bangladesh 61
#> 17               Barbados 61
#> 18                Belarus 61
#> 19                Belgium 61
#> 20                 Belize 61
#> 21                  Benin 61
#> 22                Bermuda 61
#> 23                 Bhutan 61
#> 24                Bolivia 61
#> 25 Bosnia and Herzegovina 61
#> 26               Botswana 61
#> 27                 Brazil 61
#> 28 British Virgin Islands 61
#> 29      Brunei Darussalam 61
#> 30               Bulgaria 61
#> 31           Burkina Faso 61
#> 32                Burundi 61
#> 33             Cabo Verde 61
#> 34               Cambodia 61
#> 35               Cameroon 61
#>  [ reached 'max' / getOption("max.print") -- omitted 181 rows ]
wlddev |> findex_by(country, year) |> fcount(add = TRUE)
#>       country iso3c       date year decade     region     income  OECD PCGDP
#> 1 Afghanistan   AFG 1961-01-01 1960   1960 South Asia Low income FALSE    NA
#> 2 Afghanistan   AFG 1962-01-01 1961   1960 South Asia Low income FALSE    NA
#> 3 Afghanistan   AFG 1963-01-01 1962   1960 South Asia Low income FALSE    NA
#> 4 Afghanistan   AFG 1964-01-01 1963   1960 South Asia Low income FALSE    NA
#> 5 Afghanistan   AFG 1965-01-01 1964   1960 South Asia Low income FALSE    NA
#>   LIFEEX GINI       ODA     POP  N
#> 1 32.446   NA 116769997 8996973 61
#> 2 32.962   NA 232080002 9169410 61
#> 3 33.471   NA 112839996 9351441 61
#> 4 33.971   NA 237720001 9543205 61
#> 5 34.463   NA 295920013 9744781 61
#>  [ reached 'max' / getOption("max.print") -- omitted 13171 rows ]
#> 
#> Indexed by:  country [216] | year [61] 
# Use fcountv to pass additional arguments to GRP.pdata.frame,
# here using the effect argument to choose a different index variable
wlddev |> findex_by(country, year) |> fcountv(effect = "year")
#>    year   N
#> 1  1960 216
#> 2  1961 216
#> 3  1962 216
#> 4  1963 216
#> 5  1964 216
#> 6  1965 216
#> 7  1966 216
#> 8  1967 216
#> 9  1968 216
#> 10 1969 216
#> 11 1970 216
#> 12 1971 216
#> 13 1972 216
#> 14 1973 216
#> 15 1974 216
#> 16 1975 216
#> 17 1976 216
#> 18 1977 216
#> 19 1978 216
#> 20 1979 216
#> 21 1980 216
#> 22 1981 216
#> 23 1982 216
#> 24 1983 216
#> 25 1984 216
#> 26 1985 216
#> 27 1986 216
#> 28 1987 216
#> 29 1988 216
#> 30 1989 216
#> 31 1990 216
#> 32 1991 216
#> 33 1992 216
#> 34 1993 216
#> 35 1994 216
#>  [ reached 'max' / getOption("max.print") -- omitted 26 rows ]
wlddev |> findex_by(country, year) |> fcountv(add = "group_vars", effect = "year")
#>    year   N
#> 1  1960 216
#> 2  1961 216
#> 3  1962 216
#> 4  1963 216
#> 5  1964 216
#> 6  1965 216
#> 7  1966 216
#> 8  1967 216
#> 9  1968 216
#> 10 1969 216
#> 11 1970 216
#> 12 1971 216
#> 13 1972 216
#> 14 1973 216
#> 15 1974 216
#> 16 1975 216
#> 17 1976 216
#> 18 1977 216
#> 19 1978 216
#> 20 1979 216
#> 21 1980 216
#> 22 1981 216
#> 23 1982 216
#> 24 1983 216
#> 25 1984 216
#> 26 1985 216
#> 27 1986 216
#> 28 1987 216
#> 29 1988 216
#> 30 1989 216
#> 31 1990 216
#> 32 1991 216
#> 33 1992 216
#> 34 1993 216
#> 35 1994 216
#>  [ reached 'max' / getOption("max.print") -- omitted 13141 rows ]
#> 
#> Indexed by:  country [216] | year [61]