Skip to content

BUG: for several datasets, download_if_missing keyword was ignored. #7944

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Nov 29, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions sklearn/datasets/california_housing.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,12 @@ def fetch_california_housing(data_home=None, download_if_missing=True):
data_home = get_data_home(data_home=data_home)
if not exists(data_home):
makedirs(data_home)

filepath = _pkl_filepath(data_home, TARGET_FILENAME)
if not exists(filepath):
if not download_if_missing:
raise IOError("Data not found and `download_if_missing` is False")

print('downloading Cal. housing from %s to %s' % (DATA_URL, data_home))
archive_fileobj = BytesIO(urlopen(DATA_URL).read())
fileobj = tarfile.open(
Expand Down
3 changes: 3 additions & 0 deletions sklearn/datasets/covtype.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,9 @@ def fetch_covtype(data_home=None, download_if_missing=True,

joblib.dump(X, samples_path, compress=9)
joblib.dump(y, targets_path, compress=9)
elif not available:
if not download_if_missing:
raise IOError("Data not found and `download_if_missing` is False")

try:
X, y
Expand Down
3 changes: 3 additions & 0 deletions sklearn/datasets/kddcup99.py
Original file line number Diff line number Diff line change
Expand Up @@ -345,6 +345,9 @@ def _fetch_brute_kddcup99(subset=None, data_home=None,

joblib.dump(X, samples_path, compress=0)
joblib.dump(y, targets_path, compress=0)
elif not available:
if not download_if_missing:
raise IOError("Data not found and `download_if_missing` is False")

try:
X, y
Expand Down
4 changes: 4 additions & 0 deletions sklearn/datasets/olivetti_faces.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,9 @@ def fetch_olivetti_faces(data_home=None, shuffle=False, random_state=0,
makedirs(data_home)
filepath = _pkl_filepath(data_home, TARGET_FILENAME)
if not exists(filepath):
if not download_if_missing:
raise IOError("Data not found and `download_if_missing` is False")

print('downloading Olivetti faces from %s to %s'
% (DATA_URL, data_home))
fhandle = urlopen(DATA_URL)
Expand All @@ -121,6 +124,7 @@ def fetch_olivetti_faces(data_home=None, shuffle=False, random_state=0,
del mfile
else:
faces = joblib.load(filepath)

# We want floating point data, but float32 is enough (there is only
# one byte of precision in the original uint8s anyway)
faces = np.float32(faces)
Expand Down
3 changes: 3 additions & 0 deletions sklearn/datasets/species_distributions.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,9 @@ def fetch_species_distributions(data_home=None,
archive_path = _pkl_filepath(data_home, DATA_ARCHIVE_NAME)

if not exists(archive_path):
if not download_if_missing:
raise IOError("Data not found and `download_if_missing` is False")

print('Downloading species data from %s to %s' % (SAMPLES_URL,
data_home))
X = np.load(BytesIO(urlopen(SAMPLES_URL).read()))
Expand Down
6 changes: 2 additions & 4 deletions sklearn/datasets/tests/test_covtype.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
Skipped if covtype is not already downloaded to data_home.
"""

import errno
from sklearn.datasets import fetch_covtype
from sklearn.utils.testing import assert_equal, SkipTest

Expand All @@ -15,9 +14,8 @@ def fetch(*args, **kwargs):
def test_fetch():
try:
data1 = fetch(shuffle=True, random_state=42)
except IOError as e:
if e.errno == errno.ENOENT:
raise SkipTest("Covertype dataset can not be loaded.")
except IOError:
raise SkipTest("Covertype dataset can not be loaded.")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@agramfort I know it's old code, but why use a SkipTest rather than an assert_raise_message?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't know whether the call will succeed, right?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Of course. Thanks for the clarification and sorry for the misunderstanding @amueller.


data2 = fetch(shuffle=True, random_state=37)

Expand Down
6 changes: 2 additions & 4 deletions sklearn/datasets/tests/test_kddcup99.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,15 @@
scikit-learn data folder.
"""

import errno
from sklearn.datasets import fetch_kddcup99
from sklearn.utils.testing import assert_equal, SkipTest


def test_percent10():
try:
data = fetch_kddcup99(download_if_missing=False)
except IOError as e:
if e.errno == errno.ENOENT:
raise SkipTest("kddcup99 dataset can not be loaded.")
except IOError:
raise SkipTest("kddcup99 dataset can not be loaded.")

assert_equal(data.data.shape, (494021, 41))
assert_equal(data.target.shape, (494021,))
Expand Down