scikit-learn/sklearn/base.py at f5aac217372759d4d35d69934199be878c3bcc65 · scikit-learn/scikit-learn

History

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

356

357

358

359

360

361

362

363

364

365

366

367

368

369

370

371

372

373

374

375

376

377

378

379

380

381

382

383

384

385

386

387

388

389

390

391

392

393

394

395

396

397

398

399

400

401

402

403

404

405

406

407

408

409

410

411

412

413

414

415

416

417

418

419

420

421

422

423

424

425

426

427

428

429

430

431

432

433

434

435

436

437

438

439

440

441

442

443

444

445

446

447

448

449

450

451

452

453

454

455

456

457

458

459

460

461

462

463

464

465

466

467

468

469

470

471

472

473

474

475

476

477

478

479

480

481

482

483

484

485

486

487

488

489

490

491

492

493

494

495

496

497

498

499

500

501

502

503

504

505

506

507

508

509

510

511

512

513

514

515

516

517

518

519

520

521

522

523

524

525

526

527

528

529

530

531

532

533

534

535

536

537

538

539

540

541

542

543

544

545

546

547

548

549

550

551

552

553

554

555

556

557

558

559

560

561

562

563

564

565

566

567

568

569

570

571

572

573

574

575

576

577

578

579

580

581

582

583

584

585

586

587

588

589

590

591

592

593

594

595

596

597

598

599

600

601

602

603

604

605

606

607

608

609

610

611

612

613

614

615

616

617

618

619

620

621

622

623

624

625

626

627

628

629

630

631

632

633

634

635

636

637

638

639

640

641

642

643

644

645

646

647

648

649

650

651

652

653

654

655

656

657

658

659

660

661

662

663

664

665

666

667

668

669

670

671

672

673

674

675

676

677

678

679

680

681

682

683

684

685

686

687

688

689

690

691

692

693

694

695

696

697

698

699

700

701

702

703

704

705

706

707

708

709

710

711

712

713

714

715

716

717

718

719

720

721

722

723

724

725

726

727

728

729

730

731

732

733

734

735

736

737

738

739

740

741

742

743

744

745

746

747

748

749

750

751

752

753

754

755

756

757

758

759

760

761

762

763

764

765

766

767

768

769

770

771

772

773

774

775

776

777

778

779

780

781

782

783

784

785

786

787

788

789

790

791

792

793

794

795

796

797

798

799

800

801

802

803

804

805

806

807

808

809

810

811

812

813

814

815

816

817

818

819

820

821

822

823

824

825

826

827

828

829

830

831

832

833

834

835

836

837

838

839

840

841

842

843

844

845

846

847

848

849

850

851

852

853

854

855

856

857

858

859

860

861

862

863

864

865

866

867

868

869

870

871

872

873

874

875

876

877

878

879

880

881

882

883

884

885

886

887

888

889

890

891

892

893

894

895

896

897

898

899

900

901

902

903

904

905

906

907

908

909

910

911

912

913

914

915

916

917

918

919

920

921

922

923

924

925

926

927

928

929

930

931

932

933

934

935

936

937

938

939

940

941

942

943

944

945

946

947

948

949

950

951

952

953

954

955

956

957

958

959

960

961

962

963

964

965

966

967

968

969

970

971

972

973

974

975

976

977

978

979

980

981

982

983

984

985

986

987

988

989

990

991

992

993

994

995

996

997

998

999

1000

"""Base classes for all estimators and various utility functions."""

# Author: Gael Varoquaux <gael.varoquaux@normalesup.org>

# License: BSD 3 clause

import copy

import functools

import inspect

import platform

import re

import warnings

from collections import defaultdict

import numpy as np

from . import __version__

from ._config import config_context, get_config

from .exceptions import InconsistentVersionWarning

from .utils._estimator_html_repr import _HTMLDocumentationLinkMixin, estimator_html_repr

from .utils._metadata_requests import _MetadataRequester, _routing_enabled

from .utils._param_validation import validate_parameter_constraints

from .utils._set_output import _SetOutputMixin

from .utils._tags import (

_DEFAULT_TAGS,

)

from .utils.fixes import _IS_32BIT

from .utils.validation import (

_check_feature_names_in,

_check_y,

_generate_get_feature_names_out,

_get_feature_names,

_is_fitted,

_num_features,

check_array,

check_is_fitted,

check_X_y,

)

def clone(estimator, *, safe=True):
    """Build a fresh, unfitted estimator carrying the same parameters.

    The model held by the estimator is deep-copied, but data attached to
    it is not. The result is a new estimator with identical parameters
    that has not been fitted on any data.

    .. versionchanged:: 1.3
        Delegates to `estimator.__sklearn_clone__` if the method exists.

    Parameters
    ----------
    estimator : {list, tuple, set} of estimator instance or a single \
            estimator instance
        The estimator or group of estimators to be cloned.
    safe : bool, default=True
        If safe is False, clone will fall back to a deep copy on objects
        that are not estimators. Ignored if `estimator.__sklearn_clone__`
        exists.

    Returns
    -------
    estimator : object
        The deep copy of the input, an estimator if input is an estimator.

    Notes
    -----
    If the estimator's `random_state` parameter is an integer (or if the
    estimator doesn't have a `random_state` parameter), an *exact clone* is
    returned: the clone and the original estimator will give the exact same
    results. Otherwise, *statistical clone* is returned: the clone might
    return different results from the original estimator. More details can be
    found in :ref:`randomness`.

    Examples
    --------
    >>> from sklearn.base import clone
    >>> from sklearn.linear_model import LogisticRegression
    >>> X = [[-1, 0], [0, 1], [0, -1], [1, 0]]
    >>> y = [0, 0, 1, 1]
    >>> classifier = LogisticRegression().fit(X, y)
    >>> cloned_classifier = clone(classifier)
    >>> hasattr(classifier, "classes_")
    True
    >>> hasattr(cloned_classifier, "classes_")
    False
    >>> classifier is cloned_classifier
    False
    """
    # Only *instances* exposing the hook get to customise cloning; a class
    # object that merely defines the method goes through the default path.
    uses_clone_hook = hasattr(
        estimator, "__sklearn_clone__"
    ) and not inspect.isclass(estimator)

    if not uses_clone_hook:
        return _clone_parametrized(estimator, safe=safe)

    return estimator.__sklearn_clone__()

def _clone_parametrized(estimator, *, safe=True):

"""Default implementation of clone. See :func:`sklearn.base.clone` for details."""

estimator_type = type(estimator)

if estimator_type is dict:

return {k: clone(v, safe=safe) for k, v in estimator.items()}

elif estimator_type in (list, tuple, set, frozenset):

return estimator_type([clone(e, safe=safe) for e in estimator])

elif not hasattr(estimator, "get_params") or isinstance(estimator, type):

if not safe:

return copy.deepcopy(estimator)

else:

if isinstance(estimator, type):

raise TypeError(

"Cannot clone object. "

+ "You should provide an instance of "

+ "scikit-learn estimator instead of a class."

)

else:

raise TypeError(

"Cannot clone object '%s' (type %s): "

"it does not seem to be a scikit-learn "

"estimator as it does not implement a "

"'get_params' method." % (repr(estimator), type(estimator))

)

klass = estimator.__class__

new_object_params = estimator.get_params(deep=False)

for name, param in new_object_params.items():

new_object_params[name] = clone(param, safe=False)

new_object = klass(**new_object_params)

try:

new_object._metadata_request = copy.deepcopy(estimator._metadata_request)

except AttributeError:

pass

params_set = new_object.get_params(deep=False)

# quick sanity check of the parameters of the clone

for name in new_object_params:

param1 = new_object_params[name]

param2 = params_set[name]

if param1 is not param2:

raise RuntimeError(

"Cannot clone object %s, as the constructor "

"either does not set or modifies parameter %s" % (estimator, name)

)

# _sklearn_output_config is used by `set_output` to configure the output

# container of an estimator.

if hasattr(estimator, "_sklearn_output_config"):

new_object._sklearn_output_config = copy.deepcopy(

estimator._sklearn_output_config

)

return new_object

class BaseEstimator(_HTMLDocumentationLinkMixin, _MetadataRequester):
    """Base class for all estimators in scikit-learn.

    Inheriting from this class provides default implementations of:

    - setting and getting parameters used by `GridSearchCV` and friends;
    - textual and HTML representation displayed in terminals and IDEs;
    - estimator serialization;
    - parameters validation;
    - data validation;
    - feature names validation.

    Read more in the :ref:`User Guide <rolling_your_own_estimator>`.

    Notes
    -----
    All estimators should specify all the parameters that can be set
    at the class level in their ``__init__`` as explicit keyword
    arguments (no ``*args`` or ``**kwargs``).

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.base import BaseEstimator
    >>> class MyEstimator(BaseEstimator):
    ...     def __init__(self, *, param=1):
    ...         self.param = param
    ...     def fit(self, X, y=None):
    ...         self.is_fitted_ = True
    ...         return self
    ...     def predict(self, X):
    ...         return np.full(shape=X.shape[0], fill_value=self.param)
    >>> estimator = MyEstimator(param=2)
    >>> estimator.get_params()
    {'param': 2}
    >>> X = np.array([[1, 2], [2, 3], [3, 4]])
    >>> y = np.array([1, 0, 1])
    >>> estimator.fit(X, y).predict(X)
    array([2, 2, 2])
    >>> estimator.set_params(param=3).fit(X, y).predict(X)
    array([3, 3, 3])
    """

    @classmethod
    def _get_param_names(cls):
        """Get parameter names for the estimator.

        Returns
        -------
        names : list of str
            Sorted names of the constructor parameters, excluding ``self``
            and any ``**kwargs`` catch-all.

        Raises
        ------
        RuntimeError
            If the constructor declares ``*args``.
        """
        # fetch the constructor or the original constructor before
        # deprecation wrapping if any
        init = getattr(cls.__init__, "deprecated_original", cls.__init__)
        if init is object.__init__:
            # No explicit constructor to introspect
            return []

        # introspect the constructor arguments to find the model parameters
        # to represent
        init_signature = inspect.signature(init)
        # Consider the constructor parameters excluding 'self'
        parameters = [
            p
            for p in init_signature.parameters.values()
            if p.name != "self" and p.kind != p.VAR_KEYWORD
        ]
        for p in parameters:
            if p.kind == p.VAR_POSITIONAL:
                raise RuntimeError(
                    "scikit-learn estimators should always "
                    "specify their parameters in the signature"
                    " of their __init__ (no varargs)."
                    " %s with constructor %s doesn't "
                    " follow this convention." % (cls, init_signature)
                )
        # Extract and sort argument names excluding 'self'
        return sorted([p.name for p in parameters])

    def get_params(self, deep=True):
        """
        Get parameters for this estimator.

        Parameters
        ----------
        deep : bool, default=True
            If True, will return the parameters for this estimator and
            contained subobjects that are estimators.

        Returns
        -------
        params : dict
            Parameter names mapped to their values.
        """
        out = dict()
        for key in self._get_param_names():
            value = getattr(self, key)
            # Class objects may expose `get_params` as an unbound method;
            # only instances are expanded into `<key>__<sub_key>` entries.
            if deep and hasattr(value, "get_params") and not isinstance(value, type):
                deep_items = value.get_params().items()
                out.update((key + "__" + k, val) for k, val in deep_items)
            out[key] = value
        return out

    def set_params(self, **params):
        """Set the parameters of this estimator.

        The method works on simple estimators as well as on nested objects
        (such as :class:`~sklearn.pipeline.Pipeline`). The latter have
        parameters of the form ``<component>__<parameter>`` so that it's
        possible to update each component of a nested object.

        Parameters
        ----------
        **params : dict
            Estimator parameters.

        Returns
        -------
        self : estimator instance
            Estimator instance.

        Raises
        ------
        ValueError
            If a key (or the prefix before ``__``) is not a valid parameter.
        """
        if not params:
            # Simple optimization to gain speed (inspect is slow)
            return self
        valid_params = self.get_params(deep=True)

        nested_params = defaultdict(dict)  # grouped by prefix
        for key, value in params.items():
            key, delim, sub_key = key.partition("__")
            if key not in valid_params:
                local_valid_params = self._get_param_names()
                raise ValueError(
                    f"Invalid parameter {key!r} for estimator {self}. "
                    f"Valid parameters are: {local_valid_params!r}."
                )

            if delim:
                nested_params[key][sub_key] = value
            else:
                setattr(self, key, value)
                # Keep the lookup table current so that nested updates below
                # are applied to a component replaced in this same call.
                valid_params[key] = value

        for key, sub_params in nested_params.items():
            valid_params[key].set_params(**sub_params)

        return self

    def __sklearn_clone__(self):
        """Clone hook used by :func:`sklearn.base.clone`; uses the default."""
        return _clone_parametrized(self)

    def __repr__(self, N_CHAR_MAX=700):
        """Return the pretty-printed representation, shortened if too long.

        Parameters
        ----------
        N_CHAR_MAX : int, default=700
            The (approximate) maximum number of non-blank characters to
            render. It is an optional parameter to ease the tests.
        """
        from .utils._pprint import _EstimatorPrettyPrinter

        N_MAX_ELEMENTS_TO_SHOW = 30  # number of elements to show in sequences

        # use ellipsis for sequences with a lot of elements
        pp = _EstimatorPrettyPrinter(
            compact=True,
            indent=1,
            indent_at_name=True,
            n_max_elements_to_show=N_MAX_ELEMENTS_TO_SHOW,
        )

        repr_ = pp.pformat(self)

        # Use bruteforce ellipsis when there are a lot of non-blank characters
        n_nonblank = len("".join(repr_.split()))
        if n_nonblank > N_CHAR_MAX:
            lim = N_CHAR_MAX // 2  # apprx number of chars to keep on both ends
            regex = r"^(\s*\S){%d}" % lim
            # The regex '^(\s*\S){%d}' % n
            # matches from the start of the string until the nth non-blank
            # character:
            # - ^ matches the start of string
            # - (pattern){n} matches n repetitions of pattern
            # - \s*\S matches a non-blank char following zero or more blanks
            left_lim = re.match(regex, repr_).end()
            right_lim = re.match(regex, repr_[::-1]).end()

            if "\n" in repr_[left_lim:-right_lim]:
                # The left side and right side aren't on the same line.
                # To avoid weird cuts, e.g.:
                # categoric...ore',
                # we need to start the right side with an appropriate newline
                # character so that it renders properly as:
                # categoric...
                # handle_unknown='ignore',
                # so we add [^\n]*\n which matches until the next \n
                regex += r"[^\n]*\n"
                right_lim = re.match(regex, repr_[::-1]).end()

            ellipsis = "..."
            if left_lim + len(ellipsis) < len(repr_) - right_lim:
                # Only add ellipsis if it results in a shorter repr
                repr_ = repr_[:left_lim] + ellipsis + repr_[-right_lim:]

        return repr_

    def __getstate__(self):
        """Return the pickling state, tagged with the scikit-learn version.

        The ``_sklearn_version`` entry is only added for classes whose
        module name starts with ``"sklearn."``.

        Raises
        ------
        TypeError
            If the instance defines a non-empty ``__slots__``.
        """
        if getattr(self, "__slots__", None):
            raise TypeError(
                "You cannot use `__slots__` in objects inheriting from "
                "`sklearn.base.BaseEstimator`."
            )

        try:
            state = super().__getstate__()
            if state is None:
                # For Python 3.11+, empty instance (no `__slots__`,
                # and `__dict__`) will return a state equal to `None`.
                state = self.__dict__.copy()
        except AttributeError:
            # Python < 3.11
            state = self.__dict__.copy()

        if type(self).__module__.startswith("sklearn."):
            return dict(state.items(), _sklearn_version=__version__)
        else:
            return state

    def __setstate__(self, state):
        """Restore the pickled state, warning on a version mismatch.

        For classes whose module name starts with ``"sklearn."``, the
        ``_sklearn_version`` entry is removed from ``state`` and compared
        with the running version; an `InconsistentVersionWarning` is issued
        when they differ.
        """
        if type(self).__module__.startswith("sklearn."):
            pickle_version = state.pop("_sklearn_version", "pre-0.18")
            if pickle_version != __version__:
                warnings.warn(
                    InconsistentVersionWarning(
                        estimator_name=self.__class__.__name__,
                        current_sklearn_version=__version__,
                        original_sklearn_version=pickle_version,
                    ),
                )
        try:
            super().__setstate__(state)
        except AttributeError:
            self.__dict__.update(state)

    def _more_tags(self):
        """Return the default estimator tags (`_DEFAULT_TAGS`)."""
        return _DEFAULT_TAGS

    def _get_tags(self):
        """Collect the estimator tags across the class hierarchy.

        The MRO is walked in reverse, so entries contributed by classes
        earlier in the MRO overwrite those of classes later in the MRO.

        Returns
        -------
        collected_tags : dict
            The merged tags.
        """
        collected_tags = {}
        for base_class in reversed(inspect.getmro(self.__class__)):
            if hasattr(base_class, "_more_tags"):
                # need the if because mixins might not have _more_tags
                # but might do redundant work in estimators
                # (i.e. calling more tags on BaseEstimator multiple times)
                more_tags = base_class._more_tags(self)
                collected_tags.update(more_tags)
        return collected_tags

    def _check_n_features(self, X, reset):
        """Set the `n_features_in_` attribute, or check against it.

        Parameters
        ----------
        X : {ndarray, sparse matrix} of shape (n_samples, n_features)
            The input samples.
        reset : bool
            If True, the `n_features_in_` attribute is set to `X.shape[1]`.
            If False and the attribute exists, then check that it is equal to
            `X.shape[1]`. If False and the attribute does *not* exist, then
            the check is skipped.

            .. note::
               It is recommended to call reset=True in `fit` and in the first
               call to `partial_fit`. All other methods that validate `X`
               should set `reset=False`.

        Raises
        ------
        ValueError
            If ``reset`` is False and the number of features of `X` is
            undefined or differs from `n_features_in_`.
        """
        try:
            n_features = _num_features(X)
        except TypeError as e:
            if not reset and hasattr(self, "n_features_in_"):
                raise ValueError(
                    "X does not contain any features, but "
                    f"{self.__class__.__name__} is expecting "
                    f"{self.n_features_in_} features"
                ) from e
            # If the number of features is not defined and reset=True,
            # then we skip this check
            return

        if reset:
            self.n_features_in_ = n_features
            return

        if not hasattr(self, "n_features_in_"):
            # Skip this check if the expected number of expected input features
            # was not recorded by calling fit first. This is typically the case
            # for stateless transformers.
            return

        if n_features != self.n_features_in_:
            raise ValueError(
                f"X has {n_features} features, but {self.__class__.__name__} "
                f"is expecting {self.n_features_in_} features as input."
            )

    def _check_feature_names(self, X, *, reset):
        """Set or check the `feature_names_in_` attribute.

        .. versionadded:: 1.0

        Parameters
        ----------
        X : {ndarray, dataframe} of shape (n_samples, n_features)
            The input samples.

        reset : bool
            Whether to reset the `feature_names_in_` attribute.
            If False, the input will be checked for consistency with
            feature names of data provided when reset was last True.

            .. note::
               It is recommended to call `reset=True` in `fit` and in the first
               call to `partial_fit`. All other methods that validate `X`
               should set `reset=False`.

        Raises
        ------
        ValueError
            If ``reset`` is False and both the fitted data and `X` have
            feature names that do not match (content or order).
        """

        if reset:
            feature_names_in = _get_feature_names(X)
            if feature_names_in is not None:
                self.feature_names_in_ = feature_names_in
            elif hasattr(self, "feature_names_in_"):
                # Delete the attribute when the estimator is fitted on a new dataset
                # that has no feature names.
                delattr(self, "feature_names_in_")
            return

        fitted_feature_names = getattr(self, "feature_names_in_", None)
        X_feature_names = _get_feature_names(X)

        if fitted_feature_names is None and X_feature_names is None:
            # no feature names seen in fit and in X
            return

        if X_feature_names is not None and fitted_feature_names is None:
            warnings.warn(
                f"X has feature names, but {self.__class__.__name__} was fitted without"
                " feature names"
            )
            return

        if X_feature_names is None and fitted_feature_names is not None:
            warnings.warn(
                "X does not have valid feature names, but"
                f" {self.__class__.__name__} was fitted with feature names"
            )
            return

        # validate the feature names against the `feature_names_in_` attribute
        if len(fitted_feature_names) != len(X_feature_names) or np.any(
            fitted_feature_names != X_feature_names
        ):
            message = (
                "The feature names should match those that were passed during fit.\n"
            )
            fitted_feature_names_set = set(fitted_feature_names)
            X_feature_names_set = set(X_feature_names)

            unexpected_names = sorted(X_feature_names_set - fitted_feature_names_set)
            missing_names = sorted(fitted_feature_names_set - X_feature_names_set)

            def add_names(names):
                # Render at most `max_n_names` names as a bullet list,
                # followed by an elision marker when there are more.
                output = ""
                max_n_names = 5
                for i, name in enumerate(names):
                    if i >= max_n_names:
                        output += "- ...\n"
                        break
                    output += f"- {name}\n"
                return output

            if unexpected_names:
                message += "Feature names unseen at fit time:\n"
                message += add_names(unexpected_names)

            if missing_names:
                message += "Feature names seen at fit time, yet now missing:\n"
                message += add_names(missing_names)

            if not missing_names and not unexpected_names:
                message += (
                    "Feature names must be in the same order as they were in fit.\n"
                )

            raise ValueError(message)

    def _validate_data(
        self,
        X="no_validation",
        y="no_validation",
        reset=True,
        validate_separately=False,
        cast_to_ndarray=True,
        **check_params,
    ):
        """Validate input data and set or check the `n_features_in_` attribute.

        Parameters
        ----------
        X : {array-like, sparse matrix, dataframe} of shape \
                (n_samples, n_features), default='no_validation'
            The input samples.
            If `'no_validation'`, no validation is performed on `X`. This is
            useful for meta-estimator which can delegate input validation to
            their underlying estimator(s). In that case `y` must be passed and
            the only accepted `check_params` are `multi_output` and
            `y_numeric`.

        y : array-like of shape (n_samples,), default='no_validation'
            The targets.

            - If `None`, `check_array` is called on `X`. If the estimator's
              requires_y tag is True, then an error will be raised.
            - If `'no_validation'`, `check_array` is called on `X` and the
              estimator's requires_y tag is ignored. This is a default
              placeholder and is never meant to be explicitly set. In that case
              `X` must be passed.
            - Otherwise, only `y` with `_check_y` or both `X` and `y` are
              checked with either `check_array` or `check_X_y` depending on
              `validate_separately`.

        reset : bool, default=True
            Whether to reset the `n_features_in_` attribute.
            If False, the input will be checked for consistency with data
            provided when reset was last True.

            .. note::
               It is recommended to call reset=True in `fit` and in the first
               call to `partial_fit`. All other methods that validate `X`
               should set `reset=False`.

        validate_separately : False or tuple of dicts, default=False
            Only used if y is not None.
            If False, call check_X_y(). Else, it must be a tuple of kwargs
            to be used for calling check_array() on X and y respectively.

            `estimator=self` is automatically added to these dicts to generate
            more informative error message in case of invalid input data.

        cast_to_ndarray : bool, default=True
            Cast `X` and `y` to ndarray with checks in `check_params`. If
            `False`, `X` and `y` are unchanged and only `feature_names_in_` and
            `n_features_in_` are checked.

        **check_params : kwargs
            Parameters passed to :func:`sklearn.utils.check_array` or
            :func:`sklearn.utils.check_X_y`. Ignored if validate_separately
            is not False.

            `estimator=self` is automatically added to these params to generate
            more informative error message in case of invalid input data.

        Returns
        -------
        out : {ndarray, sparse matrix} or tuple of these
            The validated input. A tuple is returned if both `X` and `y` are
            validated.

        Raises
        ------
        ValueError
            If `y` is None while the ``requires_y`` tag is set, or if neither
            `X` nor `y` is to be validated.
        """
        self._check_feature_names(X, reset=reset)

        if y is None and self._get_tags()["requires_y"]:
            raise ValueError(
                f"This {self.__class__.__name__} estimator "
                "requires y to be passed, but the target y is None."
            )

        no_val_X = isinstance(X, str) and X == "no_validation"
        no_val_y = y is None or isinstance(y, str) and y == "no_validation"

        if no_val_X and no_val_y:
            raise ValueError("Validation should be done on X, y or both.")

        default_check_params = {"estimator": self}
        check_params = {**default_check_params, **check_params}

        if not cast_to_ndarray:
            if not no_val_X and no_val_y:
                out = X
            elif no_val_X and not no_val_y:
                out = y
            else:
                out = X, y
        elif not no_val_X and no_val_y:
            out = check_array(X, input_name="X", **check_params)
        elif no_val_X and not no_val_y:
            out = _check_y(y, **check_params)
        else:
            if validate_separately:
                # We need this because some estimators validate X and y
                # separately, and in general, separately calling check_array()
                # on X and y isn't equivalent to just calling check_X_y()
                # :(
                check_X_params, check_y_params = validate_separately
                if "estimator" not in check_X_params:
                    check_X_params = {**default_check_params, **check_X_params}
                X = check_array(X, input_name="X", **check_X_params)
                if "estimator" not in check_y_params:
                    check_y_params = {**default_check_params, **check_y_params}
                y = check_array(y, input_name="y", **check_y_params)
            else:
                X, y = check_X_y(X, y, **check_params)
            out = X, y

        if not no_val_X and check_params.get("ensure_2d", True):
            self._check_n_features(X, reset=reset)

        return out

    def _validate_params(self):
        """Validate types and values of constructor parameters

        The expected type and values must be defined in the `_parameter_constraints`
        class attribute, which is a dictionary `param_name: list of constraints`. See
        the docstring of `validate_parameter_constraints` for a description of the
        accepted constraints.
        """
        validate_parameter_constraints(
            self._parameter_constraints,
            self.get_params(deep=False),
            caller_name=self.__class__.__name__,
        )

    @property
    def _repr_html_(self):
        """HTML representation of estimator.

        This is redundant with the logic of `_repr_mimebundle_`. The latter
        should be favored in the long term, `_repr_html_` is only
        implemented for consumers who do not interpret `_repr_mimebundle_`.
        """
        if get_config()["display"] != "diagram":
            raise AttributeError(
                "_repr_html_ is only defined when the "
                "'display' configuration option is set to "
                "'diagram'"
            )
        return self._repr_html_inner

    def _repr_html_inner(self):
        """This function is returned by the @property `_repr_html_` to make
        `hasattr(estimator, "_repr_html_")` return `True` or `False` depending
        on `get_config()["display"]`.
        """
        return estimator_html_repr(self)

    def _repr_mimebundle_(self, **kwargs):
        """Mime bundle used by jupyter kernels to display estimator"""
        output = {"text/plain": repr(self)}
        if get_config()["display"] == "diagram":
            output["text/html"] = estimator_html_repr(self)
        return output

class ClassifierMixin:
    """Mixin class for all classifiers in scikit-learn.

    What the mixin brings to an estimator:

    - a `_estimator_type` class attribute set to `"classifier"`;
    - a `score` method based on :func:`~sklearn.metrics.accuracy_score`;
    - a `requires_y` tag set to True, so that `fit` must receive `y`.

    Read more in the :ref:`User Guide <rolling_your_own_estimator>`.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.base import BaseEstimator, ClassifierMixin
    >>> # Mixin classes should always be on the left-hand side for a correct MRO
    >>> class MyEstimator(ClassifierMixin, BaseEstimator):
    ...     def __init__(self, *, param=1):
    ...         self.param = param
    ...     def fit(self, X, y=None):
    ...         self.is_fitted_ = True
    ...         return self
    ...     def predict(self, X):
    ...         return np.full(shape=X.shape[0], fill_value=self.param)
    >>> estimator = MyEstimator(param=1)
    >>> X = np.array([[1, 2], [2, 3], [3, 4]])
    >>> y = np.array([1, 0, 1])
    >>> estimator.fit(X, y).predict(X)
    array([1, 1, 1])
    >>> estimator.score(X, y)
    0.66...
    """

    _estimator_type = "classifier"

    def score(self, X, y, sample_weight=None):
        """
        Return the mean accuracy on the given test data and labels.

        For multi-label problems this is the subset accuracy, a harsh
        metric: a sample only counts as correct when its whole label set
        is predicted correctly.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Test samples.

        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
            True labels for `X`.

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights.

        Returns
        -------
        score : float
            Mean accuracy of ``self.predict(X)`` w.r.t. `y`.
        """
        # Imported lazily, at call time rather than at module import.
        from .metrics import accuracy_score

        predictions = self.predict(X)
        return accuracy_score(y, predictions, sample_weight=sample_weight)

    def _more_tags(self):
        """Tags contributed by this mixin: a target is mandatory."""
        tags = {"requires_y": True}
        return tags

class RegressorMixin:
    """Mixin class for all regression estimators in scikit-learn.

    What the mixin brings to an estimator:

    - a `_estimator_type` class attribute set to `"regressor"`;
    - a `score` method based on :func:`~sklearn.metrics.r2_score`;
    - a `requires_y` tag set to True, so that `fit` must receive `y`.

    Read more in the :ref:`User Guide <rolling_your_own_estimator>`.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.base import BaseEstimator, RegressorMixin
    >>> # Mixin classes should always be on the left-hand side for a correct MRO
    >>> class MyEstimator(RegressorMixin, BaseEstimator):
    ...     def __init__(self, *, param=1):
    ...         self.param = param
    ...     def fit(self, X, y=None):
    ...         self.is_fitted_ = True
    ...         return self
    ...     def predict(self, X):
    ...         return np.full(shape=X.shape[0], fill_value=self.param)
    >>> estimator = MyEstimator(param=0)
    >>> X = np.array([[1, 2], [2, 3], [3, 4]])
    >>> y = np.array([-1, 0, 1])
    >>> estimator.fit(X, y).predict(X)
    array([0, 0, 0])
    >>> estimator.score(X, y)
    0.0
    """

    _estimator_type = "regressor"

    def score(self, X, y, sample_weight=None):
        """Return the coefficient of determination of the prediction.

        The coefficient of determination :math:`R^2` is defined as
        :math:`(1 - \\frac{u}{v})`, where :math:`u` is the residual
        sum of squares ``((y_true - y_pred)** 2).sum()`` and :math:`v`
        is the total sum of squares ``((y_true - y_true.mean()) ** 2).sum()``.
        The best possible score is 1.0 and it can be negative (because the
        model can be arbitrarily worse). A constant model that always predicts
        the expected value of `y`, disregarding the input features, would get
        a :math:`R^2` score of 0.0.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Test samples. For some estimators this may be a precomputed
            kernel matrix or a list of generic objects instead with shape
            ``(n_samples, n_samples_fitted)``, where ``n_samples_fitted``
            is the number of samples used in the fitting for the estimator.

        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
            True values for `X`.

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights.

        Returns
        -------
        score : float
            :math:`R^2` of ``self.predict(X)`` w.r.t. `y`.

        Notes
        -----
        The :math:`R^2` score used when calling ``score`` on a regressor uses
        ``multioutput='uniform_average'`` from version 0.23 to keep consistent
        with default value of :func:`~sklearn.metrics.r2_score`.
        This influences the ``score`` method of all the multioutput
        regressors (except for
        :class:`~sklearn.multioutput.MultiOutputRegressor`).
        """
        # Imported lazily, at call time rather than at module import.
        from .metrics import r2_score

        return r2_score(y, self.predict(X), sample_weight=sample_weight)

    def _more_tags(self):
        """Tags contributed by this mixin: a target is mandatory."""
        tags = {"requires_y": True}
        return tags

class ClusterMixin:
    """Mixin class for all cluster estimators in scikit-learn.

    - a `_estimator_type` class attribute set to `"clusterer"`;
    - a `fit_predict` method returning the cluster label of each sample.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.base import BaseEstimator, ClusterMixin
    >>> class MyClusterer(ClusterMixin, BaseEstimator):
    ...     def fit(self, X, y=None):
    ...         self.labels_ = np.ones(shape=(len(X),), dtype=np.int64)
    ...         return self
    >>> X = [[1, 2], [2, 3], [3, 4]]
    >>> MyClusterer().fit_predict(X)
    array([1, 1, 1])
    """

    _estimator_type = "clusterer"

    def fit_predict(self, X, y=None, **kwargs):
        """
        Perform clustering on `X` and returns cluster labels.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data.

        y : Ignored
            Not used, present for API consistency by convention.

        **kwargs : dict
            Arguments to be passed to ``fit``.

            .. versionadded:: 1.4

        Returns
        -------
        labels : ndarray of shape (n_samples,), dtype=np.int64
            Cluster labels.
        """
        # Generic fallback: fit, then read the labels back from the fitted
        # estimator. Subclasses should override this when the algorithm
        # allows a more direct computation. `y` is deliberately not forwarded.
        fitted = self.fit
        fitted(X, **kwargs)
        labels = self.labels_
        return labels

    def _more_tags(self):
        """Tags contributed by this mixin: no dtype is preserved."""
        tags = {"preserves_dtype": []}
        return tags

class BiclusterMixin:

"""Mixin class for all bicluster estimators in scikit-learn.

This mixin defines the following functionality:

- `biclusters_` property that returns the row and column indicators;

- `get_indices` method that returns the row and column indices of a bicluster;

- `get_shape` method that returns the shape of a bicluster;

- `get_submatrix` method that returns the submatrix corresponding to a bicluster.

Examples

--------

>>> import numpy as np

>>> from sklearn.base import BaseEstimator, BiclusterMixin

>>> class DummyBiClustering(BiclusterMixin, BaseEstimator):

... def fit(self, X, y=None):

... self.rows_ = np.ones(shape=(1, X.shape[0]), dtype=bool)

... self.columns_ = np.ones(shape=(1, X.shape[1]), dtype=bool)

... return self

>>> X = np.array([[1, 1], [2, 1], [1, 0],

... [4, 7], [3, 5], [3, 6]])

>>> bicluster = DummyBiClustering().fit(X)

>>> hasattr(bicluster, "biclusters_")

True

>>> bicluster.get_indices(0)

(array([0, 1, 2, 3, 4, 5]), array([0, 1]))

"""

@property

def biclusters_(self):

"""Convenient way to get row and column indicators together.

Returns the ``rows_`` and ``columns_`` members.

"""

return self.rows_, self.columns_

def get_indices(self, i):

"""Row and column indices of the `i`'th bicluster.

Only works if ``rows_`` and ``columns_`` attributes exist.

Parameters

----------

i : int

The index of the cluster.

Returns

-------

row_ind : ndarray, dtype=np.intp

Indices of rows in the dataset that belong to the bicluster.

col_ind : ndarray, dtype=np.intp

Indices of columns in the dataset that belong to the bicluster.

"""

rows = self.rows_[i]

columns = self.columns_[i]

return np.nonzero(rows)[0], np.nonzero(columns)[0]

def get_shape(self, i):

"""Shape of the `i`'th bicluster.

Parameters

----------

i : int

The index of the cluster.

Returns

-------

n_rows : int

Number of rows in the bicluster.

n_cols : int

Number of columns in the bicluster.

"""

indices = self.get_indices(i)

return tuple(len(i) for i in indices)

def get_submatrix(self, i, data):

"""Return the submatrix corresponding to bicluster `i`.

Parameters

----------

i : int

The index of the cluster.

data : array-like of shape (n_samples, n_features)

The data.

Returns

-------

submatrix : ndarray of shape (n_rows, n_cols)

The submatrix corresponding to bicluster `i`.

Notes

-----

Works with sparse matrices. Only works if ``rows_`` and

View remainder of file in raw view

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

base.py

base.py

Files

base.py

Latest commit

History

base.py

File metadata and controls